mirror of
https://gitlab.freedesktop.org/uchardet/uchardet.git
synced 2025-12-08 01:36:41 +08:00
BuildLangModel: add a --max-page option to limit data size.
This is mostly useful for debugging, when we don't want to wait forever to test the script.
This commit is contained in:
parent
0314f98ece
commit
274386f424
@ -59,6 +59,9 @@ usage = 'Usage: gen-lang-data <LANG-CODE>\n' \
|
||||
|
||||
description = "Internal tool for uchardet to generate language data."
|
||||
cmdline = optparse.OptionParser(usage, description = description)
|
||||
cmdline.add_option('--max-page',
|
||||
help = 'Maximum number of Wikipedia pages to parse (useful for debugging).',
|
||||
action = 'store', type = 'int', dest = 'max_page', default = None)
|
||||
(options, langs) = cmdline.parse_args()
|
||||
if len(langs) < 1:
|
||||
print("Please select at least one language code.\n")
|
||||
@ -119,6 +122,11 @@ def visit_page(title, depth, max_depth, clean_text, logfd):
|
||||
global characters
|
||||
global sequences
|
||||
global prev_char
|
||||
global options
|
||||
|
||||
if options.max_page is not None and \
|
||||
len(visited_pages) > options.max_page:
|
||||
return
|
||||
|
||||
visited_pages += [title]
|
||||
try:
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user