mirror of
https://gitlab.freedesktop.org/uchardet/uchardet.git
synced 2026-02-09 03:06:52 +08:00
BuildLangModel: max_depth should be a script option rather than a language property.
This commit is contained in:
parent
274386f424
commit
00a78faa1d
@ -62,6 +62,10 @@ cmdline = optparse.OptionParser(usage, description = description)
|
|||||||
cmdline.add_option('--max-page',
|
cmdline.add_option('--max-page',
|
||||||
help = 'Maximum number of Wikipedia pages to parse (useful for debugging).',
|
help = 'Maximum number of Wikipedia pages to parse (useful for debugging).',
|
||||||
action = 'store', type = 'int', dest = 'max_page', default = None)
|
action = 'store', type = 'int', dest = 'max_page', default = None)
|
||||||
|
cmdline.add_option('--max-depth',
|
||||||
|
help = 'Maximum depth when following links from start page (default: 2).',
|
||||||
|
action = 'store', type = 'int',
|
||||||
|
dest = 'max_depth', default = 2)
|
||||||
(options, langs) = cmdline.parse_args()
|
(options, langs) = cmdline.parse_args()
|
||||||
if len(langs) < 1:
|
if len(langs) < 1:
|
||||||
print("Please select at least one language code.\n")
|
print("Please select at least one language code.\n")
|
||||||
@ -96,8 +100,6 @@ if not hasattr(lang, 'start_page') or lang.start_page is None:
|
|||||||
lang.start_page = 'Main_Page'
|
lang.start_page = 'Main_Page'
|
||||||
if not hasattr(lang, 'wikipedia_code') or lang.wikipedia_code is None:
|
if not hasattr(lang, 'wikipedia_code') or lang.wikipedia_code is None:
|
||||||
lang.wikipedia_code = lang.code
|
lang.wikipedia_code = lang.code
|
||||||
if not hasattr(lang, 'max_depth') or lang.max_depth is None:
|
|
||||||
lang.max_depth = 2
|
|
||||||
if not hasattr(lang, 'clean_wikipedia_content') or lang.clean_wikipedia_content is None:
|
if not hasattr(lang, 'clean_wikipedia_content') or lang.clean_wikipedia_content is None:
|
||||||
lang.clean_wikipedia_content = None
|
lang.clean_wikipedia_content = None
|
||||||
|
|
||||||
@ -116,7 +118,7 @@ characters = {}
|
|||||||
sequences = {}
|
sequences = {}
|
||||||
prev_char = None
|
prev_char = None
|
||||||
|
|
||||||
def visit_page(title, depth, max_depth, clean_text, logfd):
|
def visit_page(title, depth, clean_text, logfd):
|
||||||
global charsets
|
global charsets
|
||||||
global visited_pages
|
global visited_pages
|
||||||
global characters
|
global characters
|
||||||
@ -178,22 +180,25 @@ def visit_page(title, depth, max_depth, clean_text, logfd):
|
|||||||
else:
|
else:
|
||||||
prev_char = None
|
prev_char = None
|
||||||
|
|
||||||
if depth == max_depth:
|
if depth == options.max_depth:
|
||||||
return
|
return
|
||||||
|
|
||||||
for link in page.links:
|
for link in page.links:
|
||||||
if link in visited_pages:
|
if link in visited_pages:
|
||||||
continue
|
continue
|
||||||
visit_page (link, depth + 1, max_depth, clean_text, logfd)
|
visit_page (link, depth + 1, clean_text, logfd)
|
||||||
|
|
||||||
logfd = open('LangFrenchModel.log', 'w')
|
logfd = open('LangFrenchModel.log', 'w')
|
||||||
logfd.write('= Logs of language model for {} ({}) =\n'.format(lang.name, lang.code))
|
logfd.write('= Logs of language model for {} ({}) =\n'.format(lang.name, lang.code))
|
||||||
logfd.write('\n- Generated by {}'.format(os.path.basename(__file__)))
|
logfd.write('\n- Generated by {}'.format(os.path.basename(__file__)))
|
||||||
logfd.write('\n- Started: {}'.format(str(datetime.datetime.now())))
|
logfd.write('\n- Started: {}'.format(str(datetime.datetime.now())))
|
||||||
|
logfd.write('\n- Maximum depth: {}'.format(options.max_depth))
|
||||||
|
if options.max_page is not None:
|
||||||
|
logfd.write('\n- Max number of pages: {}'.format(options.max_page))
|
||||||
logfd.write('\n\n== Parsed pages ==\n')
|
logfd.write('\n\n== Parsed pages ==\n')
|
||||||
try:
|
try:
|
||||||
visit_page(lang.start_page, 0,
|
visit_page(lang.start_page, 0,
|
||||||
lang.max_depth, lang.clean_wikipedia_content,
|
lang.clean_wikipedia_content,
|
||||||
logfd)
|
logfd)
|
||||||
except requests.exceptions.ConnectionError:
|
except requests.exceptions.ConnectionError:
|
||||||
print('Error: connection to Wikipedia failed. Aborting\n')
|
print('Error: connection to Wikipedia failed. Aborting\n')
|
||||||
|
|||||||
@ -58,9 +58,6 @@ charsets = ['ISO-8859-15', 'ISO-8859-1']
|
|||||||
start_page = 'Wikipédia:Accueil_principal'
|
start_page = 'Wikipédia:Accueil_principal'
|
||||||
# give possibility to select another code for the Wikipedia URL.
|
# give possibility to select another code for the Wikipedia URL.
|
||||||
wikipedia_code = code
|
wikipedia_code = code
|
||||||
# Let's go 2 pages deep from the main_page.
|
|
||||||
# Since a page has a lot of links, this is actually quite a lot of contents.
|
|
||||||
max_depth = 2
|
|
||||||
|
|
||||||
# A function to clean content returned by the `wikipedia` python lib,
|
# A function to clean content returned by the `wikipedia` python lib,
|
||||||
# in case some unwanted data has been overlooked.
|
# in case some unwanted data has been overlooked.
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user