mirror of
https://gitlab.freedesktop.org/uchardet/uchardet.git
synced 2026-02-14 14:20:00 +08:00
improve model building script a bit
This commit is contained in:
parent
8d15d6b557
commit
e41e8a47e4
@ -50,6 +50,7 @@ import requests
|
|||||||
import sys
|
import sys
|
||||||
import re
|
import re
|
||||||
import os
|
import os
|
||||||
|
import random
|
||||||
|
|
||||||
# Custom modules.
|
# Custom modules.
|
||||||
import charsets.db
|
import charsets.db
|
||||||
@ -240,12 +241,19 @@ def visit_pages(titles, depth, lang, logfd):
|
|||||||
return
|
return
|
||||||
|
|
||||||
next_titles = []
|
next_titles = []
|
||||||
|
max_titles = int(options.max_page/(options.max_depth * options.max_depth))
|
||||||
for title in titles:
|
for title in titles:
|
||||||
if options.max_page is not None and \
|
if options.max_page is not None and \
|
||||||
len(visited_pages) > options.max_page:
|
len(visited_pages) > options.max_page:
|
||||||
return
|
return
|
||||||
if title in visited_pages:
|
if title in visited_pages:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# Ugly hack skipping internal pages
|
||||||
|
if 'wiki' in title or 'Wiki' in title:
|
||||||
|
print('Skipping', title)
|
||||||
|
continue
|
||||||
|
|
||||||
visited_pages += [title]
|
visited_pages += [title]
|
||||||
try:
|
try:
|
||||||
page = wikipedia.page(title)
|
page = wikipedia.page(title)
|
||||||
@ -258,14 +266,19 @@ def visit_pages(titles, depth, lang, logfd):
|
|||||||
logfd.flush()
|
logfd.flush()
|
||||||
|
|
||||||
process_text(page.content, lang)
|
process_text(page.content, lang)
|
||||||
|
links = page.links
|
||||||
|
random.shuffle(links)
|
||||||
|
if len(links) > max_titles:
|
||||||
|
links = links[:max_titles]
|
||||||
try:
|
try:
|
||||||
next_titles += page.links
|
next_titles += links
|
||||||
except KeyError:
|
except KeyError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
if depth >= options.max_depth:
|
if depth >= options.max_depth:
|
||||||
return
|
return
|
||||||
|
|
||||||
|
random.shuffle(next_titles)
|
||||||
visit_pages (next_titles, depth + 1, lang, logfd)
|
visit_pages (next_titles, depth + 1, lang, logfd)
|
||||||
|
|
||||||
language_c = lang.name.replace('-', '_').title()
|
language_c = lang.name.replace('-', '_').title()
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user