Improve model building script: skip wiki-internal pages, shuffle and cap followed links

This commit is contained in:
Martin T. H. Sandsmark 2022-01-28 21:59:31 +01:00 committed by Jehan
parent 8d15d6b557
commit e41e8a47e4

View File

@ -50,6 +50,7 @@ import requests
import sys import sys
import re import re
import os import os
import random
# Custom modules. # Custom modules.
import charsets.db import charsets.db
@ -240,12 +241,19 @@ def visit_pages(titles, depth, lang, logfd):
return return
next_titles = [] next_titles = []
max_titles = int(options.max_page/(options.max_depth * options.max_depth))
for title in titles: for title in titles:
if options.max_page is not None and \ if options.max_page is not None and \
len(visited_pages) > options.max_page: len(visited_pages) > options.max_page:
return return
if title in visited_pages: if title in visited_pages:
continue continue
# Ugly hack skipping internal pages
if 'wiki' in title or 'Wiki' in title:
print('Skipping', title)
continue
visited_pages += [title] visited_pages += [title]
try: try:
page = wikipedia.page(title) page = wikipedia.page(title)
@ -258,14 +266,19 @@ def visit_pages(titles, depth, lang, logfd):
logfd.flush() logfd.flush()
process_text(page.content, lang) process_text(page.content, lang)
links = page.links
random.shuffle(links)
if len(links) > max_titles:
links = links[:max_titles]
try: try:
next_titles += page.links next_titles += links
except KeyError: except KeyError:
pass pass
if depth >= options.max_depth: if depth >= options.max_depth:
return return
random.shuffle(next_titles)
visit_pages (next_titles, depth + 1, lang, logfd) visit_pages (next_titles, depth + 1, lang, logfd)
language_c = lang.name.replace('-', '_').title() language_c = lang.name.replace('-', '_').title()