BuildLangModel: add a --max-page option to limit data size.

This is mostly useful for debugging, when we don't want to wait forever
to test the script.
This commit is contained in:
Jehan 2015-11-29 01:42:36 +01:00
parent 0314f98ece
commit 274386f424

View File

@ -59,6 +59,9 @@ usage = 'Usage: gen-lang-data <LANG-CODE>\n' \
# Build the command-line parser; `usage` is defined just above this excerpt.
description = "Internal tool for uchardet to generate language data."
cmdline = optparse.OptionParser(usage, description = description)
# Optional integer cap on the number of pages; it defaults to None when the
# flag is omitted and is stored as `options.max_page`.
cmdline.add_option('--max-page',
help = 'Maximum number of Wikipedia pages to parse (useful for debugging).',
action = 'store', type = 'int', dest = 'max_page', default = None)
# parse_args() returns the parsed option values and the leftover positional
# arguments, which are treated here as language codes.
(options, langs) = cmdline.parse_args()
if len(langs) < 1:
print("Please select at least one language code.\n")
@ -119,6 +122,11 @@ def visit_page(title, depth, max_depth, clean_text, logfd):
global characters
global sequences
global prev_char
global options
if options.max_page is not None and \
len(visited_pages) > options.max_page:
return
visited_pages += [title]
try: