From 82347030ba63ad64ccac442ef85f50661670c922 Mon Sep 17 00:00:00 2001 From: Jehan Date: Mon, 27 Apr 2020 18:14:34 +0200 Subject: [PATCH] src: add a --weight option to the CLI tool. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Syntax is: lang1:weight1,lang2:weight2… For instance: `uchardet -wfr:1.1,it:1.05 file.txt` if you think a file is probably French or maybe Italian. --- src/tools/uchardet.cpp | 85 +++++++++++++++++++++++++++++++++++------- 1 file changed, 72 insertions(+), 13 deletions(-) diff --git a/src/tools/uchardet.cpp b/src/tools/uchardet.cpp index eeeb18d..9d60a10 100644 --- a/src/tools/uchardet.cpp +++ b/src/tools/uchardet.cpp @@ -47,11 +47,10 @@ char buffer[BUFFER_SIZE]; -void detect(FILE *fp, - bool verbose) +void detect(uchardet_t handle, + FILE *fp, + bool verbose) { - uchardet_t handle = uchardet_new(); - while (!feof(fp)) { size_t len = fread(buffer, 1, BUFFER_SIZE, fp); @@ -91,8 +90,8 @@ void detect(FILE *fp, else printf("unknown\n"); } - - uchardet_delete(handle); + + uchardet_reset(handle); } void show_version() @@ -116,37 +115,68 @@ void show_usage() printf(" -v, --version Print version and build information.\n"); printf(" -h, --help Print this help.\n"); printf(" -V, --verbose Show all candidates and their confidence value.\n"); + printf(" -w, --weight Tweak language weights.\n"); printf("\n"); } int main(int argc, char ** argv) { + uchardet_t handle; static struct option longopts[] = { { "version", no_argument, NULL, 'v' }, { "help", no_argument, NULL, 'h' }, { "verbose", no_argument, NULL, 'V' }, + { "weight", required_argument, NULL, 'w' }, { 0, 0, 0, 0 }, }; - bool end_options = false; - bool verbose = false; + bool end_options = false; + bool ignore_next_option = false; + bool verbose = false; static int oc; - while((oc = getopt_long(argc, argv, "vhV", longopts, NULL)) != -1) + + handle = uchardet_new(); + while((oc = getopt_long(argc, argv, "vhVw:", longopts, NULL)) != -1) { switch (oc) { case 'v': show_version(); + uchardet_delete(handle); return 0; case 'h': show_usage(); + uchardet_delete(handle); return 0; case 'V': verbose = true; break; + case 'w': + { + char *lang_weight; + char *saveptr; + char *comma; + + lang_weight = strtok_r (optarg, ",", &saveptr); + do + { + comma = strchr (lang_weight, ':'); + if (! comma) + { + printf("-w format is lang1:weight1,lang2:weight2...\n"); + uchardet_delete(handle); + return 1; + } + *comma = '\0'; + uchardet_weigh_language(handle, lang_weight, strtof (comma + 1, NULL)); + } + while ((lang_weight = strtok_r (NULL, ",", &saveptr))); + } + break; case '?': printf("Please use %s --help.\n", argv[0]); + uchardet_delete(handle); return 1; } } @@ -157,15 +187,15 @@ int main(int argc, char ** argv) (argc == 2 && strcmp(argv[1], "--") == 0)) { // No file arg, use stdin by default - detect(f, verbose); + detect(handle, f, verbose); } for (int i = 1; i < argc; i++) { const char *filename = argv[i]; - if (strcmp(filename, "-V") == 0 || - strcmp(filename, "--verbose") == 0) + if (ignore_next_option) { + ignore_next_option = false; continue; } else if (! end_options && strcmp(filename, "--") == 0) @@ -174,6 +204,33 @@ int main(int argc, char ** argv) continue; } + if (! end_options) + { + if (strcmp(filename, "-V") == 0 || + strcmp(filename, "--verbose") == 0) + { + continue; + } + else if (strcmp(filename, "-w") == 0 || + strcmp(filename, "--weight") == 0) + { + ignore_next_option = true; + continue; + } + else if (*filename == '-' && + (*(filename + 1) == 'w' || + (*(filename + 1) == '-' && *(filename + 2) == 'w'))) + { + /* Some ugly trick to recognize -wlang:weight as well as + * --weight=lang:weight patterns. + * Obviously assuming that we have no other long option + * starting with 'w'. If we end up having one, this + * should be updated. + */ + continue; + } + } + f = fopen(filename, "r"); if (f == NULL) { @@ -185,8 +242,10 @@ int main(int argc, char ** argv) { printf("%s: ", filename); } - detect(f, verbose); + detect(handle, f, verbose); } + uchardet_delete(handle); + return error_seen; }