From b64831ff89764bff22e1e3c2c9cc9a257be9e9ce Mon Sep 17 00:00:00 2001 From: Jehan Date: Sun, 29 Nov 2015 15:51:23 +0100 Subject: [PATCH] BuildLangModel: allow a list of start pages... MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ... and add a page with a word with œ in French to make sure we have such words in our stats. --- script/BuildLangModel.py | 16 +++++++++------- script/langs/fr.py | 2 +- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/script/BuildLangModel.py b/script/BuildLangModel.py index e388883..75bd663 100755 --- a/script/BuildLangModel.py +++ b/script/BuildLangModel.py @@ -90,14 +90,15 @@ sys.path = sys_path_backup charsets = charsets.db.load(lang.charsets) -if not hasattr(lang, 'start_page') or lang.start_page is None: +if not hasattr(lang, 'start_pages') or lang.start_pages is None or \ + lang.start_pages == []: # Let's start with the main page, assuming it should have links # to relevant pages. In locale wikipedia, this page is usually redirected # to a relevant page. - print("Warning: no `start_page` set for '{}'. Using 'Main_Page'.\n" + print("Warning: no `start_pages` set for '{}'. 
Using ['Main_Page'].\n" " If you don't get good data, it is advised to set a " - "start_page` yourself.".format(lang.code)) - lang.start_page = 'Main_Page' + "`start_pages` variable yourself.".format(lang.code)) + lang.start_pages = ['Main_Page'] if not hasattr(lang, 'wikipedia_code') or lang.wikipedia_code is None: lang.wikipedia_code = lang.code if not hasattr(lang, 'clean_wikipedia_content') or lang.clean_wikipedia_content is None: @@ -206,9 +207,10 @@ if options.max_page is not None: logfd.write('\n- Max number of pages: {}'.format(options.max_page)) logfd.write('\n\n== Parsed pages ==\n') try: - visit_page(lang.start_page, 0, - lang.clean_wikipedia_content, - logfd) + for title in lang.start_pages: + visit_page(title, 0, + lang.clean_wikipedia_content, + logfd) except requests.exceptions.ConnectionError: print('Error: connection to Wikipedia failed. Aborting\n') exit(1) diff --git a/script/langs/fr.py b/script/langs/fr.py index 602d96c..65bd3c6 100644 --- a/script/langs/fr.py +++ b/script/langs/fr.py @@ -55,7 +55,7 @@ charsets = ['ISO-8859-15', 'ISO-8859-1'] ## Optional Properties ## # The start page. Though optional, it is advised to choose one yourself. -start_page = 'Wikipédia:Accueil_principal' +start_pages = ['Wikipédia:Accueil_principal', 'Bœuf_(animal)'] # give possibility to select another code for the Wikipedia URL. wikipedia_code = code # 'a' and 'A' will be considered the same character, and so on.