test: update unit test to check detected languages.

The exceptions are ASCII, UTF-16 and UTF-32, for which we don't detect
languages yet.
This commit is contained in:
Jehan 2021-03-17 12:39:54 +01:00
parent 82c1d2b25e
commit 1b5e68be00

View File

@ -35,6 +35,7 @@
* *
* ***** END LICENSE BLOCK ***** */ * ***** END LICENSE BLOCK ***** */
#include <assert.h>
#include <ctype.h> #include <ctype.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
@ -44,11 +45,10 @@
#define BUFFER_SIZE 65536 #define BUFFER_SIZE 65536
char * void
detect(FILE *fp) detect(FILE *fp, char **charset, char **lang)
{ {
uchardet_t handle = uchardet_new(); uchardet_t handle = uchardet_new();
char *charset;
char buffer[BUFFER_SIZE]; char buffer[BUFFER_SIZE];
int i; int i;
@ -67,16 +67,18 @@ detect(FILE *fp)
} }
uchardet_data_end(handle); uchardet_data_end(handle);
charset = strdup(uchardet_get_encoding(handle, 0)); *charset = strdup(uchardet_get_encoding(handle, 0));
for (i = 0; charset[i]; i++) if (uchardet_get_language(handle, 0))
*lang = strdup(uchardet_get_language(handle, 0));
else
*lang = NULL;
for (i = 0; (*charset)[i]; i++)
{ {
/* Our test files are lowercase. */ /* Our test files are lowercase. */
charset[i] = tolower(charset[i]); (*charset)[i] = tolower((*charset)[i]);
} }
uchardet_delete(handle); uchardet_delete(handle);
return charset;
} }
int int
@ -84,9 +86,13 @@ main(int argc, char ** argv)
{ {
FILE *f; FILE *f;
char *filename; char *filename;
char *path;
char *expected_charset; char *expected_charset;
char *expected_lang = NULL;
char *charset; char *charset;
int success; char *lang;
/* In a unit test, 0 means success, other returned values mean failure. */
int success = 1;
if (argc != 2) if (argc != 2)
{ {
@ -108,27 +114,41 @@ main(int argc, char ** argv)
return 1; return 1;
} }
expected_charset = strrchr(filename, '/'); path = realpath(filename, NULL);
if (expected_charset == NULL) assert(path);
{ expected_charset = strrchr(path, '/');
expected_charset = filename; assert(expected_charset);
} *expected_charset = '\0';
else expected_charset++;
{
expected_charset++;
}
expected_charset = strtok(expected_charset, "."); expected_charset = strtok(expected_charset, ".");
charset = detect(f); expected_lang = strrchr(path, '/');
assert(expected_lang);
expected_lang++;
detect(f, &charset, &lang);
fclose (f); fclose (f);
/* In a unit test, 0 means success, other returned values mean failure. */ /* No lang detection is a failure, except for a few charset for
success = (strcmp(charset, expected_charset) != 0); * which we still don't detect languages.
if (success) { * TODO.
fprintf(stderr, "Got %s, expected %s\n", charset, expected_charset); * */
if (strcmp(expected_charset, "ascii") == 0 ||
strcmp(expected_charset, "utf-16") == 0 ||
strcmp(expected_charset, "utf-16") == 0 ||
strcmp(expected_charset, "utf-32") == 0)
{
success = (strcmp(charset, expected_charset) != 0);
}
else if (lang)
{
success = (strcmp(charset, expected_charset) != 0) +
(strcmp(lang, expected_lang) != 0);
} }
free(path);
free(charset); free(charset);
free(lang);
free(filename); free(filename);
return success; return success;