test: update unit test to check detected languages.

The exceptions are ASCII, UTF-16 and UTF-32, for which we don't detect
languages yet.
This commit is contained in:
Jehan 2021-03-17 12:39:54 +01:00
parent f30c1cd8c8
commit 26ed628061

View File

@ -35,6 +35,7 @@
*
* ***** END LICENSE BLOCK ***** */
#include <assert.h>
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
@ -44,11 +45,10 @@
#define BUFFER_SIZE 65536
char *
detect(FILE *fp)
void
detect(FILE *fp, char **charset, char **lang)
{
uchardet_t handle = uchardet_new();
char *charset;
char buffer[BUFFER_SIZE];
int i;
@ -65,16 +65,18 @@ detect(FILE *fp)
}
uchardet_data_end(handle);
charset = strdup(uchardet_get_encoding(handle, 0));
for (i = 0; charset[i]; i++)
*charset = strdup(uchardet_get_encoding(handle, 0));
if (uchardet_get_language(handle, 0))
*lang = strdup(uchardet_get_language(handle, 0));
else
*lang = NULL;
for (i = 0; (*charset)[i]; i++)
{
/* Our test files are lowercase. */
charset[i] = tolower(charset[i]);
(*charset)[i] = tolower((*charset)[i]);
}
uchardet_delete(handle);
return charset;
}
int
@ -82,9 +84,13 @@ main(int argc, char ** argv)
{
FILE *f;
char *filename;
char *path;
char *expected_charset;
char *expected_lang = NULL;
char *charset;
int success;
char *lang;
/* In a unit test, 0 means success, other returned values mean failure. */
int success = 1;
if (argc != 2)
{
@ -106,24 +112,41 @@ main(int argc, char ** argv)
return 1;
}
expected_charset = strrchr(filename, '/');
if (expected_charset == NULL)
{
expected_charset = filename;
}
else
{
expected_charset++;
}
path = realpath(filename, NULL);
assert(path);
expected_charset = strrchr(path, '/');
assert(expected_charset);
*expected_charset = '\0';
expected_charset++;
expected_charset = strtok(expected_charset, ".");
charset = detect(f);
expected_lang = strrchr(path, '/');
assert(expected_lang);
expected_lang++;
detect(f, &charset, &lang);
fclose (f);
/* In a unit test, 0 means success, other returned values mean failure. */
success = (strcmp(charset, expected_charset) != 0);
/* Failing to detect a language counts as a test failure, except for
* the few charsets for which we do not detect languages yet.
* TODO.
* */
if (strcmp(expected_charset, "ascii") == 0 ||
strcmp(expected_charset, "utf-16") == 0 ||
strcmp(expected_charset, "utf-16") == 0 ||
strcmp(expected_charset, "utf-32") == 0)
{
success = (strcmp(charset, expected_charset) != 0);
}
else if (lang)
{
success = (strcmp(charset, expected_charset) != 0) +
(strcmp(lang, expected_lang) != 0);
}
free(path);
free(charset);
free(lang);
free(filename);
return success;