BuildLangModel: the max_depth should be a script option rather than a language property.
Author: Jehan
Date:   2015-11-29 01:59:28 +01:00
Parent: 274386f424
Commit: 00a78faa1d

2 changed files with 11 additions and 9 deletions

Changed file 1 of 2: the BuildLangModel script

@@ -62,6 +62,10 @@ cmdline = optparse.OptionParser(usage, description = description)
 cmdline.add_option('--max-page',
                    help = 'Maximum number of Wikipedia pages to parse (useful for debugging).',
                    action = 'store', type = 'int', dest = 'max_page', default = None)
+cmdline.add_option('--max-depth',
+                   help = 'Maximum depth when following links from start page (default: 2).',
+                   action = 'store', type = 'int',
+                   dest = 'max_depth', default = 2)
 (options, langs) = cmdline.parse_args()
 if len(langs) < 1:
     print("Please select at least one language code.\n")
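
Once parsed, the limit is available script-wide as options.max_depth instead of per-language. Below is a minimal, self-contained sketch of that optparse pattern; the option name, dest and default mirror the diff, while the usage string, the script name in the comment, and the final print are illustrative assumptions.

    # Standalone sketch of the option handling added above (not the real
    # script).  Option name, dest and default are taken from the diff;
    # everything else is illustrative.
    import optparse

    usage = 'Usage: %prog <LANG-CODE>...'   # assumed usage string
    cmdline = optparse.OptionParser(usage)
    cmdline.add_option('--max-depth',
                       help = 'Maximum depth when following links from start page (default: 2).',
                       action = 'store', type = 'int',
                       dest = 'max_depth', default = 2)

    (options, langs) = cmdline.parse_args()
    # e.g. `python BuildLangModel.py fr --max-depth=3` would print 3 here
    # (the script file name is assumed from the commit title).
    print('Link-following depth limit: {}'.format(options.max_depth))
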
@@ -96,8 +100,6 @@ if not hasattr(lang, 'start_page') or lang.start_page is None:
     lang.start_page = 'Main_Page'
 if not hasattr(lang, 'wikipedia_code') or lang.wikipedia_code is None:
     lang.wikipedia_code = lang.code
-if not hasattr(lang, 'max_depth') or lang.max_depth is None:
-    lang.max_depth = 2
 if not hasattr(lang, 'clean_wikipedia_content') or lang.clean_wikipedia_content is None:
     lang.clean_wikipedia_content = None
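
For context, this block applies defaults for optional attributes that a language module may leave undefined; with this commit, max_depth simply drops out of that list. A small sketch of the pattern, using a stand-in object instead of the real imported module:

    # Stand-in for an imported language module; only `code` is defined here.
    class Lang:
        code = 'fr'

    lang = Lang()

    # Optional attributes fall back to a default when missing or None.
    if not hasattr(lang, 'wikipedia_code') or lang.wikipedia_code is None:
        lang.wikipedia_code = lang.code
    # No max_depth fallback any more: the crawl depth now comes solely from
    # the --max-depth command-line option (options.max_depth).

    print(lang.wikipedia_code)   # -> 'fr'
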
@@ -116,7 +118,7 @@ characters = {}
 sequences = {}
 prev_char = None
 
-def visit_page(title, depth, max_depth, clean_text, logfd):
+def visit_page(title, depth, clean_text, logfd):
     global charsets
     global visited_pages
     global characters
@@ -178,22 +180,25 @@ def visit_page(title, depth, max_depth, clean_text, logfd):
         else:
             prev_char = None
-    if depth == max_depth:
+    if depth == options.max_depth:
         return
     for link in page.links:
         if link in visited_pages:
             continue
-        visit_page (link, depth + 1, max_depth, clean_text, logfd)
+        visit_page (link, depth + 1, clean_text, logfd)
 logfd = open('LangFrenchModel.log', 'w')
 logfd.write('= Logs of language model for {} ({}) =\n'.format(lang.name, lang.code))
 logfd.write('\n- Generated by {}'.format(os.path.basename(__file__)))
 logfd.write('\n- Started: {}'.format(str(datetime.datetime.now())))
+logfd.write('\n- Maximum depth: {}'.format(options.max_depth))
 if options.max_page is not None:
     logfd.write('\n- Max number of pages: {}'.format(options.max_page))
 logfd.write('\n\n== Parsed pages ==\n')
 try:
     visit_page(lang.start_page, 0,
-               lang.max_depth, lang.clean_wikipedia_content,
+               lang.clean_wikipedia_content,
                logfd)
 except requests.exceptions.ConnectionError:
     print('Error: connection to Wikipedia failed. Aborting\n')
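
The depth check runs after a page's content has been processed and before its links are followed, so with the default of 2 the crawl parses the start page plus pages one and two clicks away, but stops expanding from the depth-2 pages. A minimal sketch of that recursion, with the Wikipedia fetching replaced by a hypothetical in-memory page graph:

    # Depth-limited crawl sketch: the real script pulls pages from the
    # `wikipedia` library; here a plain dict stands in so the cut-off is
    # easy to see.  Names below are illustrative, not the real script's.
    class Options:
        max_depth = 2          # stands in for the parsed --max-depth value

    options = Options()
    visited_pages = set()

    # Hypothetical page graph: title -> linked titles.
    PAGES = {
        'Main_Page': ['A', 'B'],
        'A': ['C'],
        'B': [],
        'C': ['D'],
        'D': [],
    }

    def visit_page(title, depth):
        visited_pages.add(title)
        print('{}{} (depth {})'.format('  ' * depth, title, depth))
        if depth == options.max_depth:
            return                       # do not follow links any deeper
        for link in PAGES.get(title, []):
            if link in visited_pages:
                continue
            visit_page(link, depth + 1)

    visit_page('Main_Page', 0)           # visits Main_Page, A, C, B; D stays unvisited
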

Changed file 2 of 2: the French language definition module

@@ -58,9 +58,6 @@ charsets = ['ISO-8859-15', 'ISO-8859-1']
 start_page = 'Wikipédia:Accueil_principal'
 # give possibility to select another code for the Wikipedia URL.
 wikipedia_code = code
-# Let's go 2 pages deep from the main_page.
-# Since a page has a lot of links, this is actually quite a lot of contents.
-max_depth = 2
 # A function to clean content returned by the `wikipedia` python lib,
 # in case some unwanted data has been overlooked.
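
After this change, a per-language module only needs to describe the language itself; crawl behaviour such as depth lives entirely in the script options. A sketch of what such a module could look like, with attribute names taken from the diff where they appear and the rest assumed:

    # Sketch of a per-language definition module after this commit.
    # `name`, `code` and the exact attribute set are assumptions; charsets,
    # start_page and wikipedia_code are taken from the diff above.
    name = 'French'
    code = 'fr'

    charsets = ['ISO-8859-15', 'ISO-8859-1']

    # Page from which the crawl starts.
    start_page = 'Wikipédia:Accueil_principal'

    # Give possibility to select another code for the Wikipedia URL.
    wikipedia_code = code

    # Optional hook to clean content returned by the `wikipedia` python lib;
    # note there is no max_depth here any more.
    clean_wikipedia_content = None
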