mirror of
https://gitlab.freedesktop.org/uchardet/uchardet.git
synced 2026-01-01 03:12:24 +08:00
BuildLangModel: max_depth should be a script option rather than a language property.
This commit is contained in:
parent
274386f424
commit
00a78faa1d
@ -62,6 +62,10 @@ cmdline = optparse.OptionParser(usage, description = description)
|
||||
cmdline.add_option('--max-page',
|
||||
help = 'Maximum number of Wikipedia pages to parse (useful for debugging).',
|
||||
action = 'store', type = 'int', dest = 'max_page', default = None)
|
||||
cmdline.add_option('--max-depth',
|
||||
help = 'Maximum depth when following links from start page (default: 2).',
|
||||
action = 'store', type = 'int',
|
||||
dest = 'max_depth', default = 2)
|
||||
(options, langs) = cmdline.parse_args()
|
||||
if len(langs) < 1:
|
||||
print("Please select at least one language code.\n")
|
||||
@ -96,8 +100,6 @@ if not hasattr(lang, 'start_page') or lang.start_page is None:
|
||||
lang.start_page = 'Main_Page'
|
||||
if not hasattr(lang, 'wikipedia_code') or lang.wikipedia_code is None:
|
||||
lang.wikipedia_code = lang.code
|
||||
if not hasattr(lang, 'max_depth') or lang.max_depth is None:
|
||||
lang.max_depth = 2
|
||||
if not hasattr(lang, 'clean_wikipedia_content') or lang.clean_wikipedia_content is None:
|
||||
lang.clean_wikipedia_content = None
|
||||
|
||||
@ -116,7 +118,7 @@ characters = {}
|
||||
sequences = {}
|
||||
prev_char = None
|
||||
|
||||
def visit_page(title, depth, max_depth, clean_text, logfd):
|
||||
def visit_page(title, depth, clean_text, logfd):
|
||||
global charsets
|
||||
global visited_pages
|
||||
global characters
|
||||
@ -178,22 +180,25 @@ def visit_page(title, depth, max_depth, clean_text, logfd):
|
||||
else:
|
||||
prev_char = None
|
||||
|
||||
if depth == max_depth:
|
||||
if depth == options.max_depth:
|
||||
return
|
||||
|
||||
for link in page.links:
|
||||
if link in visited_pages:
|
||||
continue
|
||||
visit_page (link, depth + 1, max_depth, clean_text, logfd)
|
||||
visit_page (link, depth + 1, clean_text, logfd)
|
||||
|
||||
logfd = open('LangFrenchModel.log', 'w')
|
||||
logfd.write('= Logs of language model for {} ({}) =\n'.format(lang.name, lang.code))
|
||||
logfd.write('\n- Generated by {}'.format(os.path.basename(__file__)))
|
||||
logfd.write('\n- Started: {}'.format(str(datetime.datetime.now())))
|
||||
logfd.write('\n- Maximum depth: {}'.format(options.max_depth))
|
||||
if options.max_page is not None:
|
||||
logfd.write('\n- Max number of pages: {}'.format(options.max_page))
|
||||
logfd.write('\n\n== Parsed pages ==\n')
|
||||
try:
|
||||
visit_page(lang.start_page, 0,
|
||||
lang.max_depth, lang.clean_wikipedia_content,
|
||||
lang.clean_wikipedia_content,
|
||||
logfd)
|
||||
except requests.exceptions.ConnectionError:
|
||||
print('Error: connection to Wikipedia failed. Aborting\n')
|
||||
|
||||
@ -58,9 +58,6 @@ charsets = ['ISO-8859-15', 'ISO-8859-1']
|
||||
start_page = 'Wikipédia:Accueil_principal'
|
||||
# Allow selecting a different code for the Wikipedia URL.
|
||||
wikipedia_code = code
|
||||
# Let's go 2 pages deep from the main_page.
|
||||
# Since a page has a lot of links, this is actually quite a lot of content.
|
||||
max_depth = 2
|
||||
|
||||
# A function to clean content returned by the `wikipedia` python lib,
|
||||
# in case some unwanted data has been overlooked.
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user