src: add a --weight option to the CLI tool.

Syntax is: lang1:weight1,lang2:weight2…
For instance, run `uchardet -wfr:1.1,it:1.05 file.txt` if you think a file
is probably French or possibly Italian.
This commit is contained in:
Jehan 2020-04-27 18:14:34 +02:00
parent 7f99b91388
commit 82347030ba

View File

@ -47,11 +47,10 @@
char buffer[BUFFER_SIZE]; char buffer[BUFFER_SIZE];
void detect(FILE *fp, void detect(uchardet_t handle,
bool verbose) FILE *fp,
bool verbose)
{ {
uchardet_t handle = uchardet_new();
while (!feof(fp)) while (!feof(fp))
{ {
size_t len = fread(buffer, 1, BUFFER_SIZE, fp); size_t len = fread(buffer, 1, BUFFER_SIZE, fp);
@ -91,8 +90,8 @@ void detect(FILE *fp,
else else
printf("unknown\n"); printf("unknown\n");
} }
uchardet_delete(handle); uchardet_reset(handle);
} }
void show_version() void show_version()
@ -116,37 +115,68 @@ void show_usage()
printf(" -v, --version Print version and build information.\n"); printf(" -v, --version Print version and build information.\n");
printf(" -h, --help Print this help.\n"); printf(" -h, --help Print this help.\n");
printf(" -V, --verbose Show all candidates and their confidence value.\n"); printf(" -V, --verbose Show all candidates and their confidence value.\n");
printf(" -w, --weight Tweak language weights.\n");
printf("\n"); printf("\n");
} }
int main(int argc, char ** argv) int main(int argc, char ** argv)
{ {
uchardet_t handle;
static struct option longopts[] = static struct option longopts[] =
{ {
{ "version", no_argument, NULL, 'v' }, { "version", no_argument, NULL, 'v' },
{ "help", no_argument, NULL, 'h' }, { "help", no_argument, NULL, 'h' },
{ "verbose", no_argument, NULL, 'V' }, { "verbose", no_argument, NULL, 'V' },
{ "weight", required_argument, NULL, 'w' },
{ 0, 0, 0, 0 }, { 0, 0, 0, 0 },
}; };
bool end_options = false; bool end_options = false;
bool verbose = false; bool ignore_next_option = false;
bool verbose = false;
static int oc; static int oc;
while((oc = getopt_long(argc, argv, "vhV", longopts, NULL)) != -1)
handle = uchardet_new();
while((oc = getopt_long(argc, argv, "vhVw:", longopts, NULL)) != -1)
{ {
switch (oc) switch (oc)
{ {
case 'v': case 'v':
show_version(); show_version();
uchardet_delete(handle);
return 0; return 0;
case 'h': case 'h':
show_usage(); show_usage();
uchardet_delete(handle);
return 0; return 0;
case 'V': case 'V':
verbose = true; verbose = true;
break; break;
case 'w':
{
char *lang_weight;
char *saveptr;
char *comma;
lang_weight = strtok_r (optarg, ",", &saveptr);
do
{
comma = strchr (lang_weight, ':');
if (! comma)
{
printf("-w format is lang1:weight1,lang2:weight2...\n");
uchardet_delete(handle);
return 1;
}
*comma = '\0';
uchardet_weigh_language(handle, lang_weight, strtof (comma + 1, NULL));
}
while ((lang_weight = strtok_r (NULL, ",", &saveptr)));
}
break;
case '?': case '?':
printf("Please use %s --help.\n", argv[0]); printf("Please use %s --help.\n", argv[0]);
uchardet_delete(handle);
return 1; return 1;
} }
} }
@ -157,15 +187,15 @@ int main(int argc, char ** argv)
(argc == 2 && strcmp(argv[1], "--") == 0)) (argc == 2 && strcmp(argv[1], "--") == 0))
{ {
// No file arg, use stdin by default // No file arg, use stdin by default
detect(f, verbose); detect(handle, f, verbose);
} }
for (int i = 1; i < argc; i++) for (int i = 1; i < argc; i++)
{ {
const char *filename = argv[i]; const char *filename = argv[i];
if (strcmp(filename, "-V") == 0 || if (ignore_next_option)
strcmp(filename, "--verbose") == 0)
{ {
ignore_next_option = false;
continue; continue;
} }
else if (! end_options && strcmp(filename, "--") == 0) else if (! end_options && strcmp(filename, "--") == 0)
@ -174,6 +204,33 @@ int main(int argc, char ** argv)
continue; continue;
} }
if (! end_options)
{
if (strcmp(filename, "-V") == 0 ||
strcmp(filename, "--verbose") == 0)
{
continue;
}
else if (strcmp(filename, "-w") == 0 ||
strcmp(filename, "--weight") == 0)
{
ignore_next_option = true;
continue;
}
else if (*filename == '-' &&
(*(filename + 1) == 'w' ||
(*(filename + 1) == '-' && *(filename + 2) == 'w')))
{
/* Some ugly trick to recognize -wlang:weight as well as
* --weight=lang:weight patterns.
* Obviously assuming that we have no other long option
* starting with 'w'. If we end up having one, this
* should be updated.
*/
continue;
}
}
f = fopen(filename, "r"); f = fopen(filename, "r");
if (f == NULL) if (f == NULL)
{ {
@ -185,8 +242,10 @@ int main(int argc, char ** argv)
{ {
printf("%s: ", filename); printf("%s: ", filename);
} }
detect(f, verbose); detect(handle, f, verbose);
} }
uchardet_delete(handle);
return error_seen; return error_seen;
} }