diff --git a/script/BuildLangModel.py b/script/BuildLangModel.py
index 8382b3e..5fb3a9e 100755
--- a/script/BuildLangModel.py
+++ b/script/BuildLangModel.py
@@ -62,6 +62,10 @@ cmdline = optparse.OptionParser(usage, description = description)
 cmdline.add_option('--max-page',
                    help = 'Maximum number of Wikipedia pages to parse (useful for debugging).',
                    action = 'store', type = 'int', dest = 'max_page', default = None)
+cmdline.add_option('--max-depth',
+                   help = 'Maximum depth when following links from start page (default: 2).',
+                   action = 'store', type = 'int',
+                   dest = 'max_depth', default = 2)
 (options, langs) = cmdline.parse_args()
 if len(langs) < 1:
     print("Please select at least one language code.\n")
@@ -96,8 +100,6 @@
 if not hasattr(lang, 'start_page') or lang.start_page is None:
     lang.start_page = 'Main_Page'
 if not hasattr(lang, 'wikipedia_code') or lang.wikipedia_code is None:
     lang.wikipedia_code = lang.code
-if not hasattr(lang, 'max_depth') or lang.max_depth is None:
-    lang.max_depth = 2
 if not hasattr(lang, 'clean_wikipedia_content') or lang.clean_wikipedia_content is None:
     lang.clean_wikipedia_content = None
@@ -116,7 +118,7 @@
 characters = {}
 sequences = {}
 prev_char = None
-def visit_page(title, depth, max_depth, clean_text, logfd):
+def visit_page(title, depth, clean_text, logfd):
     global charsets
     global visited_pages
     global characters
@@ -178,22 +180,25 @@
         else:
             prev_char = None
 
-    if depth == max_depth:
+    if depth == options.max_depth:
         return
     for link in page.links:
         if link in visited_pages:
             continue
-        visit_page (link, depth + 1, max_depth, clean_text, logfd)
+        visit_page (link, depth + 1, clean_text, logfd)
 
 
 logfd = open('LangFrenchModel.log', 'w')
 logfd.write('= Logs of language model for {} ({}) =\n'.format(lang.name, lang.code))
 logfd.write('\n- Generated by {}'.format(os.path.basename(__file__)))
 logfd.write('\n- Started: {}'.format(str(datetime.datetime.now())))
+logfd.write('\n- Maximum depth: {}'.format(options.max_depth))
+if options.max_page is not None:
+    logfd.write('\n- Max number of pages: {}'.format(options.max_page))
 logfd.write('\n\n== Parsed pages ==\n')
 try:
     visit_page(lang.start_page, 0,
-               lang.max_depth, lang.clean_wikipedia_content,
+               lang.clean_wikipedia_content,
                logfd)
 except requests.exceptions.ConnectionError:
     print('Error: connection to Wikipedia failed. Aborting\n')
diff --git a/script/langs/fr.py b/script/langs/fr.py
index 64dd1a4..f2951a2 100644
--- a/script/langs/fr.py
+++ b/script/langs/fr.py
@@ -58,9 +58,6 @@
 charsets = ['ISO-8859-15', 'ISO-8859-1']
 start_page = 'Wikipédia:Accueil_principal'
 # give possibility to select another code for the Wikipedia URL.
 wikipedia_code = code
-# Let's go 2 pages deep from the main_page.
-# Since a page has a lot of links, this is actually quite a lot of contents.
-max_depth = 2
 # A function to clean content returned by the `wikipedia` python lib,
 # in case some unwanted data has been overlooked.