mirror of
https://gitlab.freedesktop.org/uchardet/uchardet.git
synced 2025-12-06 16:56:40 +08:00
Compare commits
284 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
06029ec334 | ||
|
|
9699dfce07 | ||
|
|
dff8906402 | ||
|
|
6e163c978a | ||
|
|
edae8e81cf | ||
|
|
b95252ff0c | ||
|
|
ab1d2f1120 | ||
|
|
9910941387 | ||
|
|
8fe0b2e080 | ||
|
|
bc93da89d9 | ||
|
|
bd983ca108 | ||
|
|
bdd71d88f8 | ||
|
|
7875272a8c | ||
|
|
c843d23a17 | ||
|
|
419a971e6a | ||
|
|
d40e5868d5 | ||
|
|
cec8817d79 | ||
|
|
0fe51d3851 | ||
|
|
a82139b3bd | ||
|
|
d4ef245fdc | ||
|
|
db836fad63 | ||
|
|
d6cab28fb4 | ||
|
|
abd123e07d | ||
|
|
d00d4d52b7 | ||
|
|
41d309e8a2 | ||
|
|
60dcec8a82 | ||
|
|
0fffc109b5 | ||
|
|
ffb94e4a9d | ||
|
|
5e25e93da7 | ||
|
|
6d31689632 | ||
|
|
0974920bdd | ||
|
|
127d7faf47 | ||
|
|
3a6806ab19 | ||
|
|
e6e51d9fe8 | ||
|
|
362086bf56 | ||
|
|
598fe90c91 | ||
|
|
6bb1b3e101 | ||
|
|
e311b64cd9 | ||
|
|
401eb55dfc | ||
|
|
4f35cd4416 | ||
|
|
7f386d922e | ||
|
|
fb433a57b5 | ||
|
|
908f9b8ba7 | ||
|
|
a916fb1c56 | ||
|
|
baeefc0958 | ||
|
|
b5b75b81ce | ||
|
|
0be80a21db | ||
|
|
784f614c84 | ||
|
|
6365cad4fd | ||
|
|
81b83fffa9 | ||
|
|
a3ff09bece | ||
|
|
c9446e540d | ||
|
|
bfa4b10d4d | ||
|
|
bed459c6e7 | ||
|
|
bffb7819d2 | ||
|
|
5cf3c648fb | ||
|
|
d6c5c26150 | ||
|
|
6436e1dd47 | ||
|
|
8e2cf7b81b | ||
|
|
314f062c70 | ||
|
|
41fec68674 | ||
|
|
338a51564a | ||
|
|
ba7d72e3b0 | ||
|
|
adb158b058 | ||
|
|
19737886fe | ||
|
|
9d29c3e26f | ||
|
|
b7acffc806 | ||
|
|
b725c0b2ff | ||
|
|
c782177a8d | ||
|
|
3ca49e2bc1 | ||
|
|
8113f604de | ||
|
|
a1b186fa8b | ||
|
|
9736950227 | ||
|
|
a98cdcd88f | ||
|
|
629bc879f3 | ||
|
|
0d152ff430 | ||
|
|
3996b9d648 | ||
|
|
d72a5c88ce | ||
|
|
ded948ce15 | ||
|
|
cf0ffb0c55 | ||
|
|
a7c5a167a9 | ||
|
|
b00c85a6a6 | ||
|
|
2a16ab2310 | ||
|
|
6138d9e0f0 | ||
|
|
2127f4fc0d | ||
|
|
ea32980273 | ||
|
|
25d2890676 | ||
|
|
1b5e68be00 | ||
|
|
82c1d2b25e | ||
|
|
eb8308d50a | ||
|
|
5257fc1abf | ||
|
|
dac7cbd30f | ||
|
|
b70b1ebf88 | ||
|
|
a0bfba3db3 | ||
|
|
669ede73a3 | ||
|
|
f74d602449 | ||
|
|
d48ee7abc2 | ||
|
|
c550af99a7 | ||
|
|
5a949265d5 | ||
|
|
e7bf25ca08 | ||
|
|
7bc1bc4e0a | ||
|
|
8118133e00 | ||
|
|
15fc8f0a0f | ||
|
|
2f5c24006e | ||
|
|
ae6302a016 | ||
|
|
c218a3ccd6 | ||
|
|
6196f86c46 | ||
|
|
388777be51 | ||
|
|
5aa628272b | ||
|
|
c11c362b89 | ||
|
|
099a9a4fd6 | ||
|
|
e41e8a47e4 | ||
|
|
8d15d6b557 | ||
|
|
2a04e57c8f | ||
|
|
45bd32d102 | ||
|
|
ef19faa8c5 | ||
|
|
383bf118c9 | ||
|
|
143b3fe513 | ||
|
|
23a664560b | ||
|
|
b3b2bd2721 | ||
|
|
48db2b0800 | ||
|
|
d7dad549bd | ||
|
|
6f38ab95f5 | ||
|
|
c8a3572cca | ||
|
|
472a906844 | ||
|
|
8681fc060e | ||
|
|
5bcbd23acf | ||
|
|
a49f8ef6ea | ||
|
|
59f68dbe57 | ||
|
|
98bc2f31ef | ||
|
|
44a50c30ee | ||
|
|
6c7f32a751 | ||
|
|
ef0313046b | ||
|
|
4a37dfdf1c | ||
|
|
ae7acbd0f2 | ||
|
|
2694ba6363 | ||
|
|
81ab1d1da1 | ||
|
|
6afec53adc | ||
|
|
b5674dbd50 | ||
|
|
e0b9269849 | ||
|
|
60bf53c81e | ||
|
|
0cfb75724a | ||
|
|
bdfd6116a9 | ||
|
|
f136d434f0 | ||
|
|
95872ef41c | ||
|
|
df67ae4fe0 | ||
|
|
cd617d181d | ||
|
|
939482ab2b | ||
|
|
77bf71ea36 | ||
|
|
5996bbd995 | ||
|
|
056a5a6e51 | ||
|
|
1898847eb6 | ||
|
|
170ef349cf | ||
|
|
c049332c41 | ||
|
|
d9d014742a | ||
|
|
53f7ad0e0b | ||
|
|
50bc02c0ff | ||
|
|
1bf198cb0f | ||
|
|
98bf4d73fd | ||
|
|
50743e16f8 | ||
|
|
6cf13f108b | ||
|
|
94b10b9b29 | ||
|
|
64efb1b24c | ||
|
|
56b843522b | ||
|
|
d90d01bc9e | ||
|
|
119fed7e8d | ||
|
|
d62154bd6e | ||
|
|
fbd2efdbe9 | ||
|
|
0a04177787 | ||
|
|
a7525b404d | ||
|
|
a3a271dfd5 | ||
|
|
3c6d31f5c2 | ||
|
|
d76d33b88b | ||
|
|
05ba8555cd | ||
|
|
4e535503c6 | ||
|
|
f262b1d65b | ||
|
|
87d0c16e0e | ||
|
|
6bbe7da1ac | ||
|
|
ac4aa94b73 | ||
|
|
a59b1c9571 | ||
|
|
3401ac70d0 | ||
|
|
f314b76c0a | ||
|
|
5f9ec3aef0 | ||
|
|
5680cba0b8 | ||
|
|
2c752dbbe5 | ||
|
|
26e1cebad1 | ||
|
|
183092d048 | ||
|
|
26024e5c82 | ||
|
|
2700cf3a83 | ||
|
|
be01360e92 | ||
|
|
d810f1175b | ||
|
|
b7aebfdfda | ||
|
|
9f7ed67166 | ||
|
|
e138839f07 | ||
|
|
e98d257ec4 | ||
|
|
ea2f4dd40f | ||
|
|
7cb3dd9ddd | ||
|
|
2a559e7b52 | ||
|
|
157de1dc65 | ||
|
|
f14519a0fe | ||
|
|
8a8d6b654c | ||
|
|
771d78b7df | ||
|
|
20eb319359 | ||
|
|
602c1ab0fc | ||
|
|
210e52d99a | ||
|
|
e0eec3bae8 | ||
|
|
4287d3accc | ||
|
|
6cd8c322ad | ||
|
|
fb1d544007 | ||
|
|
74b4f6a62b | ||
|
|
2a3e41a6c3 | ||
|
|
6db8b6f8fe | ||
|
|
d0e7ddd8ab | ||
|
|
dbeee08335 | ||
|
|
ad647d2e0a | ||
|
|
29f18210b1 | ||
|
|
7201835c98 | ||
|
|
e7feb35627 | ||
|
|
1a1f4bfbd8 | ||
|
|
31a53570d6 | ||
|
|
d0e29dc934 | ||
|
|
ad7db2769e | ||
|
|
b44be77be6 | ||
|
|
b88a66f3f1 | ||
|
|
e28dfe3776 | ||
|
|
78b55ec9fe | ||
|
|
6c1e310f9b | ||
|
|
fcc525a64f | ||
|
|
d255184609 | ||
|
|
86755b1f57 | ||
|
|
b908b689a0 | ||
|
|
81ed86a26b | ||
|
|
aa4c2aeada | ||
|
|
50b2e0802f | ||
|
|
6500f09931 | ||
|
|
f53cb8cddd | ||
|
|
36665da832 | ||
|
|
198190461e | ||
|
|
d24bd7d578 | ||
|
|
37024460fe | ||
|
|
42c6b42f65 | ||
|
|
d5dba26e04 | ||
|
|
923d264470 | ||
|
|
1694999bce | ||
|
|
98b5e52252 | ||
|
|
600cf76a76 | ||
|
|
178c6119b8 | ||
|
|
27135a8880 | ||
|
|
0446e24c8d | ||
|
|
248d6dbd35 | ||
|
|
b6d872bbec | ||
|
|
706023139c | ||
|
|
9c3c37517c | ||
|
|
ad2f7212e2 | ||
|
|
1b4c62ac21 | ||
|
|
ffabb65712 | ||
|
|
055332ac7d | ||
|
|
6b2722885a | ||
|
|
2bade77bf9 | ||
|
|
a251753db8 | ||
|
|
7b4eb9827e | ||
|
|
4c8316f9cf | ||
|
|
886e03a523 | ||
|
|
fe7bf3e994 | ||
|
|
e5234d6b61 | ||
|
|
2856e68aac | ||
|
|
5691dc59a1 | ||
|
|
569509f844 | ||
|
|
dc03ea002f | ||
|
|
fb3c47a073 | ||
|
|
ffcd85f709 | ||
|
|
5ee1c3ee39 | ||
|
|
22b9ed2d4f | ||
|
|
f0e122b506 | ||
|
|
a167bd5e42 | ||
|
|
b56a3c7b84 | ||
|
|
55b4f23971 | ||
|
|
aa587a64bd | ||
|
|
90728e4068 | ||
|
|
0270b1e856 | ||
|
|
5d3fb3dc2f | ||
|
|
15afc5c593 | ||
|
|
ea34e8b1bd | ||
|
|
60f641bf37 |
37
.gitignore
vendored
37
.gitignore
vendored
@ -1 +1,38 @@
|
||||
__pycache__/
|
||||
|
||||
# CMake files
|
||||
CMakeCache.txt
|
||||
CMakeFiles/
|
||||
CTestTestfile.cmake
|
||||
cmake_install.cmake
|
||||
|
||||
# With make generator
|
||||
Makefile
|
||||
|
||||
# With ninja generator
|
||||
.ninja_deps
|
||||
.ninja_log
|
||||
build.ninja
|
||||
|
||||
# Built files
|
||||
uchardet-config-version.cmake
|
||||
uchardet-config.cmake
|
||||
uchardet-targets.cmake
|
||||
uchardet.pc
|
||||
src/version.script
|
||||
|
||||
# Build binaries
|
||||
src/libuchardet.a
|
||||
src/libuchardet.so*
|
||||
|
||||
src/tools/uchardet
|
||||
test/uchardet-tests
|
||||
|
||||
# For Windows (untested)
|
||||
src/libuchardet.dll
|
||||
|
||||
src/tools/uchardet.exe
|
||||
test/uchardet-tests.exe
|
||||
|
||||
# For macOS (untested)
|
||||
src/libuchardet.dylib
|
||||
|
||||
113
.gitlab-ci.yml
Normal file
113
.gitlab-ci.yml
Normal file
@ -0,0 +1,113 @@
|
||||
image: debian:testing
|
||||
|
||||
stages:
|
||||
- build
|
||||
|
||||
variables:
|
||||
GIT_DEPTH: "1"
|
||||
|
||||
# New rule necessary to allow CI to run for merge requests of external contributors.
|
||||
# See: https://gitlab.freedesktop.org/freedesktop/freedesktop/-/issues/540
|
||||
workflow:
|
||||
rules:
|
||||
- if: $CI_PIPELINE_SOURCE == 'merge_request_event'
|
||||
- if: $CI_PIPELINE_SOURCE == 'push'
|
||||
|
||||
## GNU/Linux 64-bit CIs ##
|
||||
|
||||
debian/testing-gcc:
|
||||
stage: build
|
||||
artifacts:
|
||||
expire_in: 1 week
|
||||
when: always
|
||||
name: "uchardet-build-${CI_JOB_NAME}-${CI_COMMIT_REF_NAME}"
|
||||
paths:
|
||||
- _build
|
||||
before_script:
|
||||
- apt-get update
|
||||
- apt-get install -y --no-install-recommends
|
||||
build-essential
|
||||
cmake
|
||||
script:
|
||||
- mkdir _build
|
||||
- cd _build
|
||||
- cmake ..
|
||||
- make -j "$(nproc)"
|
||||
- make test
|
||||
- make install
|
||||
|
||||
debian/testing-clang:
|
||||
extends: debian/testing-gcc
|
||||
variables:
|
||||
CC: "clang"
|
||||
CXX: "clang++"
|
||||
before_script:
|
||||
- apt-get update
|
||||
- apt-get install -y --no-install-recommends
|
||||
build-essential
|
||||
clang
|
||||
cmake
|
||||
|
||||
## Windows CIs ##
|
||||
|
||||
win64:
|
||||
stage: build
|
||||
artifacts:
|
||||
expire_in: 1 week
|
||||
when: always
|
||||
name: "uchardet-build-${CI_JOB_NAME}-${CI_COMMIT_REF_NAME}"
|
||||
paths:
|
||||
- _build
|
||||
before_script:
|
||||
- apt-get update
|
||||
- apt-get install -y --no-install-recommends
|
||||
build-essential
|
||||
cmake
|
||||
cpio
|
||||
gcc-mingw-w64-x86-64
|
||||
g++-mingw-w64-x86-64
|
||||
git
|
||||
python3-distutils
|
||||
python3-docutils
|
||||
rpm
|
||||
- apt-get install -y --reinstall ca-certificates
|
||||
- git clone --depth=${GIT_DEPTH} git://git.tuxfamily.org/gitroot/crossroad/crossroad.git
|
||||
- cd crossroad
|
||||
- ./setup.py install --prefix=`pwd`/../.local
|
||||
- cd ..
|
||||
script:
|
||||
- export PATH="`pwd`/.local/bin:$PATH"
|
||||
- mkdir _build
|
||||
- cd _build
|
||||
- echo 'crossroad cmake .. && make && make install' | crossroad w64 gimp --run="-"
|
||||
|
||||
win32:
|
||||
stage: build
|
||||
artifacts:
|
||||
expire_in: 1 week
|
||||
when: always
|
||||
name: "uchardet-build-${CI_JOB_NAME}-${CI_COMMIT_REF_NAME}"
|
||||
paths:
|
||||
- _build
|
||||
before_script:
|
||||
- apt-get update
|
||||
- apt-get install -y --no-install-recommends
|
||||
build-essential
|
||||
cmake
|
||||
cpio
|
||||
gcc-mingw-w64-i686
|
||||
g++-mingw-w64-i686
|
||||
git
|
||||
python3-distutils
|
||||
python3-docutils
|
||||
rpm
|
||||
- apt-get install -y --reinstall ca-certificates
|
||||
- git clone --depth=${GIT_DEPTH} git://git.tuxfamily.org/gitroot/crossroad/crossroad.git
|
||||
- cd crossroad
|
||||
- ./setup.py install --prefix=`pwd`/../.local
|
||||
- cd ..
|
||||
script:
|
||||
- export PATH="`pwd`/.local/bin:$PATH"
|
||||
- mkdir _build
|
||||
- cd _build
|
||||
- echo 'crossroad cmake .. && make && make install' | crossroad w32 gimp --run="-"
|
||||
118
CMakeLists.txt
118
CMakeLists.txt
@ -1,66 +1,60 @@
|
||||
######## Project settings
|
||||
cmake_minimum_required(VERSION 2.8)
|
||||
set (PACKAGE_NAME opencc)
|
||||
cmake_minimum_required(VERSION 3.5)
|
||||
include(CheckCCompilerFlag)
|
||||
set (PACKAGE_NAME uchardet)
|
||||
project (${PACKAGE_NAME} CXX C)
|
||||
enable_testing()
|
||||
|
||||
######## Package information
|
||||
set (PACKAGE_URL https://github.com/BYVoid/uchardet)
|
||||
set (PACKAGE_BUGREPORT https://github.com/BYVoid/uchardet/issues)
|
||||
set (PACKAGE_URL https://www.freedesktop.org/wiki/Software/uchardet/)
|
||||
set (PACKAGE_BUGREPORT https://gitlab.freedesktop.org/uchardet/uchardet/-/issues)
|
||||
set (UCHARDET_VERSION_MAJOR 0)
|
||||
set (UCHARDET_VERSION_MINOR 0)
|
||||
set (UCHARDET_VERSION_REVISION 4)
|
||||
set (UCHARDET_VERSION_REVISION 8)
|
||||
|
||||
if (CMAKE_BUILD_TYPE MATCHES Debug)
|
||||
set (version_suffix .Debug)
|
||||
set(version_suffix .debug)
|
||||
add_compile_options("-fsanitize=address")
|
||||
add_link_options("-fsanitize=address")
|
||||
endif (CMAKE_BUILD_TYPE MATCHES Debug)
|
||||
|
||||
set (
|
||||
UCHARDET_VERSION
|
||||
${UCHARDET_VERSION_MAJOR}.${UCHARDET_VERSION_MINOR}.${UCHARDET_VERSION_REVISION}${version_suffix}
|
||||
)
|
||||
|
||||
######## Windows
|
||||
|
||||
#if (WIN32)
|
||||
# set(CMAKE_SHARED_LIBRARY_PREFIX ${CMAKE_INSTALL_PREFIX})
|
||||
# set(CMAKE_STATIC_LIBRARY_PREFIX ${CMAKE_INSTALL_PREFIX})
|
||||
#endif (WIN32)
|
||||
set(CMAKE_CXX_STANDARD 11)
|
||||
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||
|
||||
######## Directory
|
||||
|
||||
set (DIR_PREFIX ${CMAKE_INSTALL_PREFIX})
|
||||
set (DIR_LIBRARY ${DIR_PREFIX}/${CMAKE_SHARED_LIBRARY_PREFIX})
|
||||
set (DIR_LIBRARY_STATIC ${DIR_PREFIX}/${CMAKE_STATIC_LIBRARY_PREFIX})
|
||||
set (DIR_INCLUDE ${DIR_PREFIX}/include)
|
||||
set (DIR_SHARE ${DIR_PREFIX}/share)
|
||||
set (DIR_BIN ${DIR_PREFIX}/bin)
|
||||
set (DIR_ETC ${DIR_PREFIX}/etc)
|
||||
|
||||
if (DEFINED CMAKE_INSTALL_LIBDIR)
|
||||
set (DIR_LIBRARY ${CMAKE_INSTALL_LIBDIR})
|
||||
set (DIR_LIBRARY_STATIC ${CMAKE_INSTALL_LIBDIR})
|
||||
endif (DEFINED CMAKE_INSTALL_LIBDIR)
|
||||
|
||||
if (DEFINED SHARE_INSTALL_PREFIX)
|
||||
set (DIR_SHARE ${SHARE_INSTALL_PREFIX})
|
||||
endif (DEFINED SHARE_INSTALL_PREFIX)
|
||||
|
||||
if (DEFINED INCLUDE_INSTALL_DIR)
|
||||
set (DIR_INCLUDE ${INCLUDE_INSTALL_DIR})
|
||||
endif (DEFINED INCLUDE_INSTALL_DIR)
|
||||
|
||||
if (DEFINED SYSCONF_INSTALL_DIR)
|
||||
set (DIR_ETC ${SYSCONF_INSTALL_DIR})
|
||||
endif (DEFINED SYSCONF_INSTALL_DIR)
|
||||
|
||||
set (DIR_SHARE_UCHARDET ${DIR_SHARE}/opencc)
|
||||
set (DIR_SHARE_LOCALE ${DIR_SHARE}/locale)
|
||||
include(GNUInstallDirs)
|
||||
|
||||
######## Configuration
|
||||
|
||||
option(BUILD_STATIC "Build static library"
|
||||
ON)
|
||||
option(BUILD_BINARY "Build the CLI tool." ON)
|
||||
option(BUILD_SHARED_LIBS "Build shared library and link executable to it." ON)
|
||||
option(CHECK_SSE2 "Check and enable SSE2 extensions if supported. Disabling SSE on platforms which support it may decrease performances." ON)
|
||||
set(TARGET_ARCHITECTURE "" CACHE STRING "Target CPU architecture. It is autodetected if not specified.")
|
||||
|
||||
if (BUILD_SHARED_LIBS)
|
||||
option(BUILD_STATIC "Build static library" ON)
|
||||
endif (BUILD_SHARED_LIBS)
|
||||
|
||||
if (TARGET_ARCHITECTURE STREQUAL "")
|
||||
string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" TARGET_ARCHITECTURE)
|
||||
endif (TARGET_ARCHITECTURE STREQUAL "")
|
||||
|
||||
if (TARGET_ARCHITECTURE MATCHES ".*(x86|amd|i686).*")
|
||||
CHECK_C_COMPILER_FLAG(-msse2 SUPPORTS_CFLAG_SSE2)
|
||||
CHECK_C_COMPILER_FLAG(-mfpmath=sse SUPPORTS_CFLAG_SSE_MATH)
|
||||
if (CHECK_SSE2 AND SUPPORTS_CFLAG_SSE2 AND SUPPORTS_CFLAG_SSE_MATH)
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse2 -mfpmath=sse")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse2 -mfpmath=sse")
|
||||
else (CHECK_SSE2 AND SUPPORTS_CFLAG_SSE2 AND SUPPORTS_CFLAG_SSE_MATH)
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffloat-store")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffloat-store")
|
||||
endif (CHECK_SSE2 AND SUPPORTS_CFLAG_SSE2 AND SUPPORTS_CFLAG_SSE_MATH)
|
||||
endif (TARGET_ARCHITECTURE MATCHES ".*(x86|amd|i686).*")
|
||||
|
||||
configure_file(
|
||||
uchardet.pc.in
|
||||
@ -70,9 +64,9 @@ configure_file(
|
||||
|
||||
install(
|
||||
FILES
|
||||
${CMAKE_BINARY_DIR}/uchardet.pc
|
||||
${CMAKE_CURRENT_BINARY_DIR}/uchardet.pc
|
||||
DESTINATION
|
||||
${DIR_LIBRARY}/pkgconfig
|
||||
${CMAKE_INSTALL_LIBDIR}/pkgconfig
|
||||
)
|
||||
|
||||
######## Subdirectories
|
||||
@ -80,3 +74,39 @@ install(
|
||||
add_subdirectory(src)
|
||||
add_subdirectory(doc)
|
||||
add_subdirectory(test)
|
||||
|
||||
######## Exported targets
|
||||
|
||||
install(
|
||||
EXPORT UchardetTargets
|
||||
FILE ${PACKAGE_NAME}-targets.cmake
|
||||
NAMESPACE ${PACKAGE_NAME}::
|
||||
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PACKAGE_NAME}
|
||||
)
|
||||
|
||||
export(
|
||||
EXPORT UchardetTargets
|
||||
FILE "${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}-targets.cmake"
|
||||
NAMESPACE ${PACKAGE_NAME}::
|
||||
)
|
||||
|
||||
include(CMakePackageConfigHelpers)
|
||||
write_basic_package_version_file(
|
||||
${PACKAGE_NAME}-config-version.cmake
|
||||
VERSION ${UCHARDET_VERSION}
|
||||
COMPATIBILITY AnyNewerVersion
|
||||
)
|
||||
|
||||
configure_file(
|
||||
${PACKAGE_NAME}-config.cmake.in
|
||||
${PACKAGE_NAME}-config.cmake
|
||||
@ONLY
|
||||
)
|
||||
|
||||
install (
|
||||
FILES
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}-config.cmake"
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}-config-version.cmake"
|
||||
DESTINATION
|
||||
${CMAKE_INSTALL_LIBDIR}/cmake/${PACKAGE_NAME}
|
||||
)
|
||||
|
||||
846
COPYING
846
COPYING
@ -468,3 +468,849 @@ EXHIBIT A -Mozilla Public License.
|
||||
use the text of this Exhibit A rather than the text found in the
|
||||
Original Code Source Code for Your Modifications.]
|
||||
|
||||
------------------------------------------------------------------------
|
||||
|
||||
GNU GENERAL PUBLIC LICENSE
|
||||
Version 2, June 1991
|
||||
|
||||
Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
Everyone is permitted to copy and distribute verbatim copies
|
||||
of this license document, but changing it is not allowed.
|
||||
|
||||
Preamble
|
||||
|
||||
The licenses for most software are designed to take away your
|
||||
freedom to share and change it. By contrast, the GNU General Public
|
||||
License is intended to guarantee your freedom to share and change free
|
||||
software--to make sure the software is free for all its users. This
|
||||
General Public License applies to most of the Free Software
|
||||
Foundation's software and to any other program whose authors commit to
|
||||
using it. (Some other Free Software Foundation software is covered by
|
||||
the GNU Lesser General Public License instead.) You can apply it to
|
||||
your programs, too.
|
||||
|
||||
When we speak of free software, we are referring to freedom, not
|
||||
price. Our General Public Licenses are designed to make sure that you
|
||||
have the freedom to distribute copies of free software (and charge for
|
||||
this service if you wish), that you receive source code or can get it
|
||||
if you want it, that you can change the software or use pieces of it
|
||||
in new free programs; and that you know you can do these things.
|
||||
|
||||
To protect your rights, we need to make restrictions that forbid
|
||||
anyone to deny you these rights or to ask you to surrender the rights.
|
||||
These restrictions translate to certain responsibilities for you if you
|
||||
distribute copies of the software, or if you modify it.
|
||||
|
||||
For example, if you distribute copies of such a program, whether
|
||||
gratis or for a fee, you must give the recipients all the rights that
|
||||
you have. You must make sure that they, too, receive or can get the
|
||||
source code. And you must show them these terms so they know their
|
||||
rights.
|
||||
|
||||
We protect your rights with two steps: (1) copyright the software, and
|
||||
(2) offer you this license which gives you legal permission to copy,
|
||||
distribute and/or modify the software.
|
||||
|
||||
Also, for each author's protection and ours, we want to make certain
|
||||
that everyone understands that there is no warranty for this free
|
||||
software. If the software is modified by someone else and passed on, we
|
||||
want its recipients to know that what they have is not the original, so
|
||||
that any problems introduced by others will not reflect on the original
|
||||
authors' reputations.
|
||||
|
||||
Finally, any free program is threatened constantly by software
|
||||
patents. We wish to avoid the danger that redistributors of a free
|
||||
program will individually obtain patent licenses, in effect making the
|
||||
program proprietary. To prevent this, we have made it clear that any
|
||||
patent must be licensed for everyone's free use or not licensed at all.
|
||||
|
||||
The precise terms and conditions for copying, distribution and
|
||||
modification follow.
|
||||
|
||||
GNU GENERAL PUBLIC LICENSE
|
||||
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
|
||||
|
||||
0. This License applies to any program or other work which contains
|
||||
a notice placed by the copyright holder saying it may be distributed
|
||||
under the terms of this General Public License. The "Program", below,
|
||||
refers to any such program or work, and a "work based on the Program"
|
||||
means either the Program or any derivative work under copyright law:
|
||||
that is to say, a work containing the Program or a portion of it,
|
||||
either verbatim or with modifications and/or translated into another
|
||||
language. (Hereinafter, translation is included without limitation in
|
||||
the term "modification".) Each licensee is addressed as "you".
|
||||
|
||||
Activities other than copying, distribution and modification are not
|
||||
covered by this License; they are outside its scope. The act of
|
||||
running the Program is not restricted, and the output from the Program
|
||||
is covered only if its contents constitute a work based on the
|
||||
Program (independent of having been made by running the Program).
|
||||
Whether that is true depends on what the Program does.
|
||||
|
||||
1. You may copy and distribute verbatim copies of the Program's
|
||||
source code as you receive it, in any medium, provided that you
|
||||
conspicuously and appropriately publish on each copy an appropriate
|
||||
copyright notice and disclaimer of warranty; keep intact all the
|
||||
notices that refer to this License and to the absence of any warranty;
|
||||
and give any other recipients of the Program a copy of this License
|
||||
along with the Program.
|
||||
|
||||
You may charge a fee for the physical act of transferring a copy, and
|
||||
you may at your option offer warranty protection in exchange for a fee.
|
||||
|
||||
2. You may modify your copy or copies of the Program or any portion
|
||||
of it, thus forming a work based on the Program, and copy and
|
||||
distribute such modifications or work under the terms of Section 1
|
||||
above, provided that you also meet all of these conditions:
|
||||
|
||||
a) You must cause the modified files to carry prominent notices
|
||||
stating that you changed the files and the date of any change.
|
||||
|
||||
b) You must cause any work that you distribute or publish, that in
|
||||
whole or in part contains or is derived from the Program or any
|
||||
part thereof, to be licensed as a whole at no charge to all third
|
||||
parties under the terms of this License.
|
||||
|
||||
c) If the modified program normally reads commands interactively
|
||||
when run, you must cause it, when started running for such
|
||||
interactive use in the most ordinary way, to print or display an
|
||||
announcement including an appropriate copyright notice and a
|
||||
notice that there is no warranty (or else, saying that you provide
|
||||
a warranty) and that users may redistribute the program under
|
||||
these conditions, and telling the user how to view a copy of this
|
||||
License. (Exception: if the Program itself is interactive but
|
||||
does not normally print such an announcement, your work based on
|
||||
the Program is not required to print an announcement.)
|
||||
|
||||
These requirements apply to the modified work as a whole. If
|
||||
identifiable sections of that work are not derived from the Program,
|
||||
and can be reasonably considered independent and separate works in
|
||||
themselves, then this License, and its terms, do not apply to those
|
||||
sections when you distribute them as separate works. But when you
|
||||
distribute the same sections as part of a whole which is a work based
|
||||
on the Program, the distribution of the whole must be on the terms of
|
||||
this License, whose permissions for other licensees extend to the
|
||||
entire whole, and thus to each and every part regardless of who wrote it.
|
||||
|
||||
Thus, it is not the intent of this section to claim rights or contest
|
||||
your rights to work written entirely by you; rather, the intent is to
|
||||
exercise the right to control the distribution of derivative or
|
||||
collective works based on the Program.
|
||||
|
||||
In addition, mere aggregation of another work not based on the Program
|
||||
with the Program (or with a work based on the Program) on a volume of
|
||||
a storage or distribution medium does not bring the other work under
|
||||
the scope of this License.
|
||||
|
||||
3. You may copy and distribute the Program (or a work based on it,
|
||||
under Section 2) in object code or executable form under the terms of
|
||||
Sections 1 and 2 above provided that you also do one of the following:
|
||||
|
||||
a) Accompany it with the complete corresponding machine-readable
|
||||
source code, which must be distributed under the terms of Sections
|
||||
1 and 2 above on a medium customarily used for software interchange; or,
|
||||
|
||||
b) Accompany it with a written offer, valid for at least three
|
||||
years, to give any third party, for a charge no more than your
|
||||
cost of physically performing source distribution, a complete
|
||||
machine-readable copy of the corresponding source code, to be
|
||||
distributed under the terms of Sections 1 and 2 above on a medium
|
||||
customarily used for software interchange; or,
|
||||
|
||||
c) Accompany it with the information you received as to the offer
|
||||
to distribute corresponding source code. (This alternative is
|
||||
allowed only for noncommercial distribution and only if you
|
||||
received the program in object code or executable form with such
|
||||
an offer, in accord with Subsection b above.)
|
||||
|
||||
The source code for a work means the preferred form of the work for
|
||||
making modifications to it. For an executable work, complete source
|
||||
code means all the source code for all modules it contains, plus any
|
||||
associated interface definition files, plus the scripts used to
|
||||
control compilation and installation of the executable. However, as a
|
||||
special exception, the source code distributed need not include
|
||||
anything that is normally distributed (in either source or binary
|
||||
form) with the major components (compiler, kernel, and so on) of the
|
||||
operating system on which the executable runs, unless that component
|
||||
itself accompanies the executable.
|
||||
|
||||
If distribution of executable or object code is made by offering
|
||||
access to copy from a designated place, then offering equivalent
|
||||
access to copy the source code from the same place counts as
|
||||
distribution of the source code, even though third parties are not
|
||||
compelled to copy the source along with the object code.
|
||||
|
||||
4. You may not copy, modify, sublicense, or distribute the Program
|
||||
except as expressly provided under this License. Any attempt
|
||||
otherwise to copy, modify, sublicense or distribute the Program is
|
||||
void, and will automatically terminate your rights under this License.
|
||||
However, parties who have received copies, or rights, from you under
|
||||
this License will not have their licenses terminated so long as such
|
||||
parties remain in full compliance.
|
||||
|
||||
5. You are not required to accept this License, since you have not
|
||||
signed it. However, nothing else grants you permission to modify or
|
||||
distribute the Program or its derivative works. These actions are
|
||||
prohibited by law if you do not accept this License. Therefore, by
|
||||
modifying or distributing the Program (or any work based on the
|
||||
Program), you indicate your acceptance of this License to do so, and
|
||||
all its terms and conditions for copying, distributing or modifying
|
||||
the Program or works based on it.
|
||||
|
||||
6. Each time you redistribute the Program (or any work based on the
|
||||
Program), the recipient automatically receives a license from the
|
||||
original licensor to copy, distribute or modify the Program subject to
|
||||
these terms and conditions. You may not impose any further
|
||||
restrictions on the recipients' exercise of the rights granted herein.
|
||||
You are not responsible for enforcing compliance by third parties to
|
||||
this License.
|
||||
|
||||
7. If, as a consequence of a court judgment or allegation of patent
|
||||
infringement or for any other reason (not limited to patent issues),
|
||||
conditions are imposed on you (whether by court order, agreement or
|
||||
otherwise) that contradict the conditions of this License, they do not
|
||||
excuse you from the conditions of this License. If you cannot
|
||||
distribute so as to satisfy simultaneously your obligations under this
|
||||
License and any other pertinent obligations, then as a consequence you
|
||||
may not distribute the Program at all. For example, if a patent
|
||||
license would not permit royalty-free redistribution of the Program by
|
||||
all those who receive copies directly or indirectly through you, then
|
||||
the only way you could satisfy both it and this License would be to
|
||||
refrain entirely from distribution of the Program.
|
||||
|
||||
If any portion of this section is held invalid or unenforceable under
|
||||
any particular circumstance, the balance of the section is intended to
|
||||
apply and the section as a whole is intended to apply in other
|
||||
circumstances.
|
||||
|
||||
It is not the purpose of this section to induce you to infringe any
|
||||
patents or other property right claims or to contest validity of any
|
||||
such claims; this section has the sole purpose of protecting the
|
||||
integrity of the free software distribution system, which is
|
||||
implemented by public license practices. Many people have made
|
||||
generous contributions to the wide range of software distributed
|
||||
through that system in reliance on consistent application of that
|
||||
system; it is up to the author/donor to decide if he or she is willing
|
||||
to distribute software through any other system and a licensee cannot
|
||||
impose that choice.
|
||||
|
||||
This section is intended to make thoroughly clear what is believed to
|
||||
be a consequence of the rest of this License.
|
||||
|
||||
8. If the distribution and/or use of the Program is restricted in
|
||||
certain countries either by patents or by copyrighted interfaces, the
|
||||
original copyright holder who places the Program under this License
|
||||
may add an explicit geographical distribution limitation excluding
|
||||
those countries, so that distribution is permitted only in or among
|
||||
countries not thus excluded. In such case, this License incorporates
|
||||
the limitation as if written in the body of this License.
|
||||
|
||||
9. The Free Software Foundation may publish revised and/or new versions
|
||||
of the General Public License from time to time. Such new versions will
|
||||
be similar in spirit to the present version, but may differ in detail to
|
||||
address new problems or concerns.
|
||||
|
||||
Each version is given a distinguishing version number. If the Program
|
||||
specifies a version number of this License which applies to it and "any
|
||||
later version", you have the option of following the terms and conditions
|
||||
either of that version or of any later version published by the Free
|
||||
Software Foundation. If the Program does not specify a version number of
|
||||
this License, you may choose any version ever published by the Free Software
|
||||
Foundation.
|
||||
|
||||
10. If you wish to incorporate parts of the Program into other free
|
||||
programs whose distribution conditions are different, write to the author
|
||||
to ask for permission. For software which is copyrighted by the Free
|
||||
Software Foundation, write to the Free Software Foundation; we sometimes
|
||||
make exceptions for this. Our decision will be guided by the two goals
|
||||
of preserving the free status of all derivatives of our free software and
|
||||
of promoting the sharing and reuse of software generally.
|
||||
|
||||
NO WARRANTY
|
||||
|
||||
11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
|
||||
FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
|
||||
OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
|
||||
PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
|
||||
OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
|
||||
TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
|
||||
PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
|
||||
REPAIR OR CORRECTION.
|
||||
|
||||
12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
|
||||
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
|
||||
REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
|
||||
INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
|
||||
OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
|
||||
TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
|
||||
YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
|
||||
PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGES.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
How to Apply These Terms to Your New Programs
|
||||
|
||||
If you develop a new program, and you want it to be of the greatest
|
||||
possible use to the public, the best way to achieve this is to make it
|
||||
free software which everyone can redistribute and change under these terms.
|
||||
|
||||
To do so, attach the following notices to the program. It is safest
|
||||
to attach them to the start of each source file to most effectively
|
||||
convey the exclusion of warranty; and each file should have at least
|
||||
the "copyright" line and a pointer to where the full notice is found.
|
||||
|
||||
<one line to give the program's name and a brief idea of what it does.>
|
||||
Copyright (C) <year> <name of author>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
Also add information on how to contact you by electronic and paper mail.
|
||||
|
||||
If the program is interactive, make it output a short notice like this
|
||||
when it starts in an interactive mode:
|
||||
|
||||
Gnomovision version 69, Copyright (C) year name of author
|
||||
Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
|
||||
This is free software, and you are welcome to redistribute it
|
||||
under certain conditions; type `show c' for details.
|
||||
|
||||
The hypothetical commands `show w' and `show c' should show the appropriate
|
||||
parts of the General Public License. Of course, the commands you use may
|
||||
be called something other than `show w' and `show c'; they could even be
|
||||
mouse-clicks or menu items--whatever suits your program.
|
||||
|
||||
You should also get your employer (if you work as a programmer) or your
|
||||
school, if any, to sign a "copyright disclaimer" for the program, if
|
||||
necessary. Here is a sample; alter the names:
|
||||
|
||||
Yoyodyne, Inc., hereby disclaims all copyright interest in the program
|
||||
`Gnomovision' (which makes passes at compilers) written by James Hacker.
|
||||
|
||||
<signature of Ty Coon>, 1 April 1989
|
||||
Ty Coon, President of Vice
|
||||
|
||||
This General Public License does not permit incorporating your program into
|
||||
proprietary programs. If your program is a subroutine library, you may
|
||||
consider it more useful to permit linking proprietary applications with the
|
||||
library. If this is what you want to do, use the GNU Lesser General
|
||||
Public License instead of this License.
|
||||
|
||||
------------------------------------------------------------------------
|
||||
|
||||
GNU LESSER GENERAL PUBLIC LICENSE
|
||||
Version 2.1, February 1999
|
||||
|
||||
Copyright (C) 1991, 1999 Free Software Foundation, Inc.
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
Everyone is permitted to copy and distribute verbatim copies
|
||||
of this license document, but changing it is not allowed.
|
||||
|
||||
[This is the first released version of the Lesser GPL. It also counts
|
||||
as the successor of the GNU Library Public License, version 2, hence
|
||||
the version number 2.1.]
|
||||
|
||||
Preamble
|
||||
|
||||
The licenses for most software are designed to take away your
|
||||
freedom to share and change it. By contrast, the GNU General Public
|
||||
Licenses are intended to guarantee your freedom to share and change
|
||||
free software--to make sure the software is free for all its users.
|
||||
|
||||
This license, the Lesser General Public License, applies to some
|
||||
specially designated software packages--typically libraries--of the
|
||||
Free Software Foundation and other authors who decide to use it. You
|
||||
can use it too, but we suggest you first think carefully about whether
|
||||
this license or the ordinary General Public License is the better
|
||||
strategy to use in any particular case, based on the explanations below.
|
||||
|
||||
When we speak of free software, we are referring to freedom of use,
|
||||
not price. Our General Public Licenses are designed to make sure that
|
||||
you have the freedom to distribute copies of free software (and charge
|
||||
for this service if you wish); that you receive source code or can get
|
||||
it if you want it; that you can change the software and use pieces of
|
||||
it in new free programs; and that you are informed that you can do
|
||||
these things.
|
||||
|
||||
To protect your rights, we need to make restrictions that forbid
|
||||
distributors to deny you these rights or to ask you to surrender these
|
||||
rights. These restrictions translate to certain responsibilities for
|
||||
you if you distribute copies of the library or if you modify it.
|
||||
|
||||
For example, if you distribute copies of the library, whether gratis
|
||||
or for a fee, you must give the recipients all the rights that we gave
|
||||
you. You must make sure that they, too, receive or can get the source
|
||||
code. If you link other code with the library, you must provide
|
||||
complete object files to the recipients, so that they can relink them
|
||||
with the library after making changes to the library and recompiling
|
||||
it. And you must show them these terms so they know their rights.
|
||||
|
||||
We protect your rights with a two-step method: (1) we copyright the
|
||||
library, and (2) we offer you this license, which gives you legal
|
||||
permission to copy, distribute and/or modify the library.
|
||||
|
||||
To protect each distributor, we want to make it very clear that
|
||||
there is no warranty for the free library. Also, if the library is
|
||||
modified by someone else and passed on, the recipients should know
|
||||
that what they have is not the original version, so that the original
|
||||
author's reputation will not be affected by problems that might be
|
||||
introduced by others.
|
||||
|
||||
Finally, software patents pose a constant threat to the existence of
|
||||
any free program. We wish to make sure that a company cannot
|
||||
effectively restrict the users of a free program by obtaining a
|
||||
restrictive license from a patent holder. Therefore, we insist that
|
||||
any patent license obtained for a version of the library must be
|
||||
consistent with the full freedom of use specified in this license.
|
||||
|
||||
Most GNU software, including some libraries, is covered by the
|
||||
ordinary GNU General Public License. This license, the GNU Lesser
|
||||
General Public License, applies to certain designated libraries, and
|
||||
is quite different from the ordinary General Public License. We use
|
||||
this license for certain libraries in order to permit linking those
|
||||
libraries into non-free programs.
|
||||
|
||||
When a program is linked with a library, whether statically or using
|
||||
a shared library, the combination of the two is legally speaking a
|
||||
combined work, a derivative of the original library. The ordinary
|
||||
General Public License therefore permits such linking only if the
|
||||
entire combination fits its criteria of freedom. The Lesser General
|
||||
Public License permits more lax criteria for linking other code with
|
||||
the library.
|
||||
|
||||
We call this license the "Lesser" General Public License because it
|
||||
does Less to protect the user's freedom than the ordinary General
|
||||
Public License. It also provides other free software developers Less
|
||||
of an advantage over competing non-free programs. These disadvantages
|
||||
are the reason we use the ordinary General Public License for many
|
||||
libraries. However, the Lesser license provides advantages in certain
|
||||
special circumstances.
|
||||
|
||||
For example, on rare occasions, there may be a special need to
|
||||
encourage the widest possible use of a certain library, so that it becomes
|
||||
a de-facto standard. To achieve this, non-free programs must be
|
||||
allowed to use the library. A more frequent case is that a free
|
||||
library does the same job as widely used non-free libraries. In this
|
||||
case, there is little to gain by limiting the free library to free
|
||||
software only, so we use the Lesser General Public License.
|
||||
|
||||
In other cases, permission to use a particular library in non-free
|
||||
programs enables a greater number of people to use a large body of
|
||||
free software. For example, permission to use the GNU C Library in
|
||||
non-free programs enables many more people to use the whole GNU
|
||||
operating system, as well as its variant, the GNU/Linux operating
|
||||
system.
|
||||
|
||||
Although the Lesser General Public License is Less protective of the
|
||||
users' freedom, it does ensure that the user of a program that is
|
||||
linked with the Library has the freedom and the wherewithal to run
|
||||
that program using a modified version of the Library.
|
||||
|
||||
The precise terms and conditions for copying, distribution and
|
||||
modification follow. Pay close attention to the difference between a
|
||||
"work based on the library" and a "work that uses the library". The
|
||||
former contains code derived from the library, whereas the latter must
|
||||
be combined with the library in order to run.
|
||||
|
||||
GNU LESSER GENERAL PUBLIC LICENSE
|
||||
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
|
||||
|
||||
0. This License Agreement applies to any software library or other
|
||||
program which contains a notice placed by the copyright holder or
|
||||
other authorized party saying it may be distributed under the terms of
|
||||
this Lesser General Public License (also called "this License").
|
||||
Each licensee is addressed as "you".
|
||||
|
||||
A "library" means a collection of software functions and/or data
|
||||
prepared so as to be conveniently linked with application programs
|
||||
(which use some of those functions and data) to form executables.
|
||||
|
||||
The "Library", below, refers to any such software library or work
|
||||
which has been distributed under these terms. A "work based on the
|
||||
Library" means either the Library or any derivative work under
|
||||
copyright law: that is to say, a work containing the Library or a
|
||||
portion of it, either verbatim or with modifications and/or translated
|
||||
straightforwardly into another language. (Hereinafter, translation is
|
||||
included without limitation in the term "modification".)
|
||||
|
||||
"Source code" for a work means the preferred form of the work for
|
||||
making modifications to it. For a library, complete source code means
|
||||
all the source code for all modules it contains, plus any associated
|
||||
interface definition files, plus the scripts used to control compilation
|
||||
and installation of the library.
|
||||
|
||||
Activities other than copying, distribution and modification are not
|
||||
covered by this License; they are outside its scope. The act of
|
||||
running a program using the Library is not restricted, and output from
|
||||
such a program is covered only if its contents constitute a work based
|
||||
on the Library (independent of the use of the Library in a tool for
|
||||
writing it). Whether that is true depends on what the Library does
|
||||
and what the program that uses the Library does.
|
||||
|
||||
1. You may copy and distribute verbatim copies of the Library's
|
||||
complete source code as you receive it, in any medium, provided that
|
||||
you conspicuously and appropriately publish on each copy an
|
||||
appropriate copyright notice and disclaimer of warranty; keep intact
|
||||
all the notices that refer to this License and to the absence of any
|
||||
warranty; and distribute a copy of this License along with the
|
||||
Library.
|
||||
|
||||
You may charge a fee for the physical act of transferring a copy,
|
||||
and you may at your option offer warranty protection in exchange for a
|
||||
fee.
|
||||
|
||||
2. You may modify your copy or copies of the Library or any portion
|
||||
of it, thus forming a work based on the Library, and copy and
|
||||
distribute such modifications or work under the terms of Section 1
|
||||
above, provided that you also meet all of these conditions:
|
||||
|
||||
a) The modified work must itself be a software library.
|
||||
|
||||
b) You must cause the files modified to carry prominent notices
|
||||
stating that you changed the files and the date of any change.
|
||||
|
||||
c) You must cause the whole of the work to be licensed at no
|
||||
charge to all third parties under the terms of this License.
|
||||
|
||||
d) If a facility in the modified Library refers to a function or a
|
||||
table of data to be supplied by an application program that uses
|
||||
the facility, other than as an argument passed when the facility
|
||||
is invoked, then you must make a good faith effort to ensure that,
|
||||
in the event an application does not supply such function or
|
||||
table, the facility still operates, and performs whatever part of
|
||||
its purpose remains meaningful.
|
||||
|
||||
(For example, a function in a library to compute square roots has
|
||||
a purpose that is entirely well-defined independent of the
|
||||
application. Therefore, Subsection 2d requires that any
|
||||
application-supplied function or table used by this function must
|
||||
be optional: if the application does not supply it, the square
|
||||
root function must still compute square roots.)
|
||||
|
||||
These requirements apply to the modified work as a whole. If
|
||||
identifiable sections of that work are not derived from the Library,
|
||||
and can be reasonably considered independent and separate works in
|
||||
themselves, then this License, and its terms, do not apply to those
|
||||
sections when you distribute them as separate works. But when you
|
||||
distribute the same sections as part of a whole which is a work based
|
||||
on the Library, the distribution of the whole must be on the terms of
|
||||
this License, whose permissions for other licensees extend to the
|
||||
entire whole, and thus to each and every part regardless of who wrote
|
||||
it.
|
||||
|
||||
Thus, it is not the intent of this section to claim rights or contest
|
||||
your rights to work written entirely by you; rather, the intent is to
|
||||
exercise the right to control the distribution of derivative or
|
||||
collective works based on the Library.
|
||||
|
||||
In addition, mere aggregation of another work not based on the Library
|
||||
with the Library (or with a work based on the Library) on a volume of
|
||||
a storage or distribution medium does not bring the other work under
|
||||
the scope of this License.
|
||||
|
||||
3. You may opt to apply the terms of the ordinary GNU General Public
|
||||
License instead of this License to a given copy of the Library. To do
|
||||
this, you must alter all the notices that refer to this License, so
|
||||
that they refer to the ordinary GNU General Public License, version 2,
|
||||
instead of to this License. (If a newer version than version 2 of the
|
||||
ordinary GNU General Public License has appeared, then you can specify
|
||||
that version instead if you wish.) Do not make any other change in
|
||||
these notices.
|
||||
|
||||
Once this change is made in a given copy, it is irreversible for
|
||||
that copy, so the ordinary GNU General Public License applies to all
|
||||
subsequent copies and derivative works made from that copy.
|
||||
|
||||
This option is useful when you wish to copy part of the code of
|
||||
the Library into a program that is not a library.
|
||||
|
||||
4. You may copy and distribute the Library (or a portion or
|
||||
derivative of it, under Section 2) in object code or executable form
|
||||
under the terms of Sections 1 and 2 above provided that you accompany
|
||||
it with the complete corresponding machine-readable source code, which
|
||||
must be distributed under the terms of Sections 1 and 2 above on a
|
||||
medium customarily used for software interchange.
|
||||
|
||||
If distribution of object code is made by offering access to copy
|
||||
from a designated place, then offering equivalent access to copy the
|
||||
source code from the same place satisfies the requirement to
|
||||
distribute the source code, even though third parties are not
|
||||
compelled to copy the source along with the object code.
|
||||
|
||||
5. A program that contains no derivative of any portion of the
|
||||
Library, but is designed to work with the Library by being compiled or
|
||||
linked with it, is called a "work that uses the Library". Such a
|
||||
work, in isolation, is not a derivative work of the Library, and
|
||||
therefore falls outside the scope of this License.
|
||||
|
||||
However, linking a "work that uses the Library" with the Library
|
||||
creates an executable that is a derivative of the Library (because it
|
||||
contains portions of the Library), rather than a "work that uses the
|
||||
library". The executable is therefore covered by this License.
|
||||
Section 6 states terms for distribution of such executables.
|
||||
|
||||
When a "work that uses the Library" uses material from a header file
|
||||
that is part of the Library, the object code for the work may be a
|
||||
derivative work of the Library even though the source code is not.
|
||||
Whether this is true is especially significant if the work can be
|
||||
linked without the Library, or if the work is itself a library. The
|
||||
threshold for this to be true is not precisely defined by law.
|
||||
|
||||
If such an object file uses only numerical parameters, data
|
||||
structure layouts and accessors, and small macros and small inline
|
||||
functions (ten lines or less in length), then the use of the object
|
||||
file is unrestricted, regardless of whether it is legally a derivative
|
||||
work. (Executables containing this object code plus portions of the
|
||||
Library will still fall under Section 6.)
|
||||
|
||||
Otherwise, if the work is a derivative of the Library, you may
|
||||
distribute the object code for the work under the terms of Section 6.
|
||||
Any executables containing that work also fall under Section 6,
|
||||
whether or not they are linked directly with the Library itself.
|
||||
|
||||
6. As an exception to the Sections above, you may also combine or
|
||||
link a "work that uses the Library" with the Library to produce a
|
||||
work containing portions of the Library, and distribute that work
|
||||
under terms of your choice, provided that the terms permit
|
||||
modification of the work for the customer's own use and reverse
|
||||
engineering for debugging such modifications.
|
||||
|
||||
You must give prominent notice with each copy of the work that the
|
||||
Library is used in it and that the Library and its use are covered by
|
||||
this License. You must supply a copy of this License. If the work
|
||||
during execution displays copyright notices, you must include the
|
||||
copyright notice for the Library among them, as well as a reference
|
||||
directing the user to the copy of this License. Also, you must do one
|
||||
of these things:
|
||||
|
||||
a) Accompany the work with the complete corresponding
|
||||
machine-readable source code for the Library including whatever
|
||||
changes were used in the work (which must be distributed under
|
||||
Sections 1 and 2 above); and, if the work is an executable linked
|
||||
with the Library, with the complete machine-readable "work that
|
||||
uses the Library", as object code and/or source code, so that the
|
||||
user can modify the Library and then relink to produce a modified
|
||||
executable containing the modified Library. (It is understood
|
||||
that the user who changes the contents of definitions files in the
|
||||
Library will not necessarily be able to recompile the application
|
||||
to use the modified definitions.)
|
||||
|
||||
b) Use a suitable shared library mechanism for linking with the
|
||||
Library. A suitable mechanism is one that (1) uses at run time a
|
||||
copy of the library already present on the user's computer system,
|
||||
rather than copying library functions into the executable, and (2)
|
||||
will operate properly with a modified version of the library, if
|
||||
the user installs one, as long as the modified version is
|
||||
interface-compatible with the version that the work was made with.
|
||||
|
||||
c) Accompany the work with a written offer, valid for at
|
||||
least three years, to give the same user the materials
|
||||
specified in Subsection 6a, above, for a charge no more
|
||||
than the cost of performing this distribution.
|
||||
|
||||
d) If distribution of the work is made by offering access to copy
|
||||
from a designated place, offer equivalent access to copy the above
|
||||
specified materials from the same place.
|
||||
|
||||
e) Verify that the user has already received a copy of these
|
||||
materials or that you have already sent this user a copy.
|
||||
|
||||
For an executable, the required form of the "work that uses the
|
||||
Library" must include any data and utility programs needed for
|
||||
reproducing the executable from it. However, as a special exception,
|
||||
the materials to be distributed need not include anything that is
|
||||
normally distributed (in either source or binary form) with the major
|
||||
components (compiler, kernel, and so on) of the operating system on
|
||||
which the executable runs, unless that component itself accompanies
|
||||
the executable.
|
||||
|
||||
It may happen that this requirement contradicts the license
|
||||
restrictions of other proprietary libraries that do not normally
|
||||
accompany the operating system. Such a contradiction means you cannot
|
||||
use both them and the Library together in an executable that you
|
||||
distribute.
|
||||
|
||||
7. You may place library facilities that are a work based on the
|
||||
Library side-by-side in a single library together with other library
|
||||
facilities not covered by this License, and distribute such a combined
|
||||
library, provided that the separate distribution of the work based on
|
||||
the Library and of the other library facilities is otherwise
|
||||
permitted, and provided that you do these two things:
|
||||
|
||||
a) Accompany the combined library with a copy of the same work
|
||||
based on the Library, uncombined with any other library
|
||||
facilities. This must be distributed under the terms of the
|
||||
Sections above.
|
||||
|
||||
b) Give prominent notice with the combined library of the fact
|
||||
that part of it is a work based on the Library, and explaining
|
||||
where to find the accompanying uncombined form of the same work.
|
||||
|
||||
8. You may not copy, modify, sublicense, link with, or distribute
|
||||
the Library except as expressly provided under this License. Any
|
||||
attempt otherwise to copy, modify, sublicense, link with, or
|
||||
distribute the Library is void, and will automatically terminate your
|
||||
rights under this License. However, parties who have received copies,
|
||||
or rights, from you under this License will not have their licenses
|
||||
terminated so long as such parties remain in full compliance.
|
||||
|
||||
9. You are not required to accept this License, since you have not
|
||||
signed it. However, nothing else grants you permission to modify or
|
||||
distribute the Library or its derivative works. These actions are
|
||||
prohibited by law if you do not accept this License. Therefore, by
|
||||
modifying or distributing the Library (or any work based on the
|
||||
Library), you indicate your acceptance of this License to do so, and
|
||||
all its terms and conditions for copying, distributing or modifying
|
||||
the Library or works based on it.
|
||||
|
||||
10. Each time you redistribute the Library (or any work based on the
|
||||
Library), the recipient automatically receives a license from the
|
||||
original licensor to copy, distribute, link with or modify the Library
|
||||
subject to these terms and conditions. You may not impose any further
|
||||
restrictions on the recipients' exercise of the rights granted herein.
|
||||
You are not responsible for enforcing compliance by third parties with
|
||||
this License.
|
||||
|
||||
11. If, as a consequence of a court judgment or allegation of patent
|
||||
infringement or for any other reason (not limited to patent issues),
|
||||
conditions are imposed on you (whether by court order, agreement or
|
||||
otherwise) that contradict the conditions of this License, they do not
|
||||
excuse you from the conditions of this License. If you cannot
|
||||
distribute so as to satisfy simultaneously your obligations under this
|
||||
License and any other pertinent obligations, then as a consequence you
|
||||
may not distribute the Library at all. For example, if a patent
|
||||
license would not permit royalty-free redistribution of the Library by
|
||||
all those who receive copies directly or indirectly through you, then
|
||||
the only way you could satisfy both it and this License would be to
|
||||
refrain entirely from distribution of the Library.
|
||||
|
||||
If any portion of this section is held invalid or unenforceable under any
|
||||
particular circumstance, the balance of the section is intended to apply,
|
||||
and the section as a whole is intended to apply in other circumstances.
|
||||
|
||||
It is not the purpose of this section to induce you to infringe any
|
||||
patents or other property right claims or to contest validity of any
|
||||
such claims; this section has the sole purpose of protecting the
|
||||
integrity of the free software distribution system which is
|
||||
implemented by public license practices. Many people have made
|
||||
generous contributions to the wide range of software distributed
|
||||
through that system in reliance on consistent application of that
|
||||
system; it is up to the author/donor to decide if he or she is willing
|
||||
to distribute software through any other system and a licensee cannot
|
||||
impose that choice.
|
||||
|
||||
This section is intended to make thoroughly clear what is believed to
|
||||
be a consequence of the rest of this License.
|
||||
|
||||
12. If the distribution and/or use of the Library is restricted in
|
||||
certain countries either by patents or by copyrighted interfaces, the
|
||||
original copyright holder who places the Library under this License may add
|
||||
an explicit geographical distribution limitation excluding those countries,
|
||||
so that distribution is permitted only in or among countries not thus
|
||||
excluded. In such case, this License incorporates the limitation as if
|
||||
written in the body of this License.
|
||||
|
||||
13. The Free Software Foundation may publish revised and/or new
|
||||
versions of the Lesser General Public License from time to time.
|
||||
Such new versions will be similar in spirit to the present version,
|
||||
but may differ in detail to address new problems or concerns.
|
||||
|
||||
Each version is given a distinguishing version number. If the Library
|
||||
specifies a version number of this License which applies to it and
|
||||
"any later version", you have the option of following the terms and
|
||||
conditions either of that version or of any later version published by
|
||||
the Free Software Foundation. If the Library does not specify a
|
||||
license version number, you may choose any version ever published by
|
||||
the Free Software Foundation.
|
||||
|
||||
14. If you wish to incorporate parts of the Library into other free
|
||||
programs whose distribution conditions are incompatible with these,
|
||||
write to the author to ask for permission. For software which is
|
||||
copyrighted by the Free Software Foundation, write to the Free
|
||||
Software Foundation; we sometimes make exceptions for this. Our
|
||||
decision will be guided by the two goals of preserving the free status
|
||||
of all derivatives of our free software and of promoting the sharing
|
||||
and reuse of software generally.
|
||||
|
||||
NO WARRANTY
|
||||
|
||||
15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
|
||||
WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
|
||||
EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
|
||||
OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
|
||||
KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
|
||||
LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
|
||||
THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
|
||||
|
||||
16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
|
||||
WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
|
||||
AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
|
||||
FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
|
||||
CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
|
||||
LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
|
||||
RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
|
||||
FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
|
||||
SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
|
||||
DAMAGES.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
How to Apply These Terms to Your New Libraries
|
||||
|
||||
If you develop a new library, and you want it to be of the greatest
|
||||
possible use to the public, we recommend making it free software that
|
||||
everyone can redistribute and change. You can do so by permitting
|
||||
redistribution under these terms (or, alternatively, under the terms of the
|
||||
ordinary General Public License).
|
||||
|
||||
To apply these terms, attach the following notices to the library. It is
|
||||
safest to attach them to the start of each source file to most effectively
|
||||
convey the exclusion of warranty; and each file should have at least the
|
||||
"copyright" line and a pointer to where the full notice is found.
|
||||
|
||||
<one line to give the library's name and a brief idea of what it does.>
|
||||
Copyright (C) <year> <name of author>
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
This library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with this library; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
|
||||
Also add information on how to contact you by electronic and paper mail.
|
||||
|
||||
You should also get your employer (if you work as a programmer) or your
|
||||
school, if any, to sign a "copyright disclaimer" for the library, if
|
||||
necessary. Here is a sample; alter the names:
|
||||
|
||||
Yoyodyne, Inc., hereby disclaims all copyright interest in the
|
||||
library `Frob' (a library for tweaking knobs) written by James Random Hacker.
|
||||
|
||||
<signature of Ty Coon>, 1 April 1990
|
||||
Ty Coon, President of Vice
|
||||
|
||||
That's all there is to it!
|
||||
|
||||
30
INSTALL
30
INSTALL
@ -1,4 +1,26 @@
|
||||
Execute release.sh or manually make a directory and check in, and execute
|
||||
cmake -DCMAKE_INSTALL_PREFIX=/usr -DCMAKE_BUILD_TYPE=Release && make
|
||||
Then install
|
||||
sudo make install
|
||||
# Building uchardet (generic)
|
||||
|
||||
`uchardet` uses a typical cmake installation.
|
||||
|
||||
* Configure with `cmake`. There are various options. For instance to configure
|
||||
with a prefix as a release-ready build:
|
||||
|
||||
> cmake -DCMAKE_INSTALL_PREFIX=/home/jehan/.local -DCMAKE_BUILD_TYPE=Release
|
||||
|
||||
Alternatively, use `ccmake`, the curses interface to `cmake`.
|
||||
|
||||
* Build with `make`.
|
||||
|
||||
* Install with `make install`.
|
||||
|
||||
Read `README` for more details on uchardet.
|
||||
|
||||
# Building uchardet on Windows
|
||||
|
||||
The above procedure is generic, which means it should work on any platform.
|
||||
In particular, it works well on Linux.
|
||||
|
||||
The procedure is the same on Windows, but if you want more details (for
|
||||
instance which tools to use in order to run CMake on Windows, compiler
|
||||
information, etc.), the following link may be useful:
|
||||
https://github.com/BYVoid/uchardet/issues/39#issuecomment-353873891
|
||||
|
||||
423
README.md
423
README.md
@ -1,55 +1,230 @@
|
||||
# uchardet
|
||||
|
||||
[uchardet](https://github.com/BYVoid/uchardet) is a C language binding of the original C++ implementation of the universal charset detection library by Mozilla.
|
||||
[uchardet](https://www.freedesktop.org/wiki/Software/uchardet/) is an encoding and language detector library, which takes a sequence of bytes in an unknown character encoding without any additional information, and attempts to determine the encoding of the text.
|
||||
|
||||
uchardet is an encoding detector library, which takes a sequence of bytes in an unknown character encoding without any additional information, and attempts to determine the encoding of the text. Returned encoding names are iconv-compatible.
|
||||
* Returned encoding names are [iconv](https://www.gnu.org/software/libiconv/)-compatible.
|
||||
* Returned language codes are ISO 639-1.
|
||||
|
||||
The original code of universalchardet is available at http://lxr.mozilla.org/seamonkey/source/extensions/universalchardet/
|
||||
uchardet started as a C language binding of the original C++ implementation of the universal charset detection library by Mozilla. Since then, it has learned to detect more charsets, and much more reliably than the original implementation. Moreover, it also works as a very good language detector, while still staying reasonably fast.
|
||||
|
||||
Techniques used by universalchardet are described at http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
|
||||
|
||||
## Supported Encodings
|
||||
## Supported Languages/Encodings
|
||||
|
||||
* International (Unicode)
|
||||
* UTF-8
|
||||
* UTF-16BE / UTF-16LE
|
||||
* UTF-32BE / UTF-32LE / X-ISO-10646-UCS-4-34121 / X-ISO-10646-UCS-4-21431
|
||||
* Arabic
|
||||
* UTF-8
|
||||
* ISO-8859-6
|
||||
* WINDOWS-1256
|
||||
* Belarusian
|
||||
* UTF-8
|
||||
* ISO-8859-5
|
||||
* WINDOWS-1251
|
||||
* Bulgarian
|
||||
* UTF-8
|
||||
* ISO-8859-5
|
||||
* WINDOWS-1251
|
||||
* Catalan
|
||||
* UTF-8
|
||||
* ISO-8859-1
|
||||
* WINDOWS-1252
|
||||
* Chinese
|
||||
* UTF-8
|
||||
* ISO-2022-CN
|
||||
* BIG5
|
||||
* EUC-TW
|
||||
* GB18030
|
||||
* HZ-GB-2312
|
||||
* Croatian:
|
||||
* UTF-8
|
||||
* ISO-8859-2
|
||||
* ISO-8859-13
|
||||
* ISO-8859-16
|
||||
* Windows-1250
|
||||
* IBM852
|
||||
* MAC-CENTRALEUROPE
|
||||
* Czech
|
||||
* UTF-8
|
||||
* Windows-1250
|
||||
* ISO-8859-2
|
||||
* IBM852
|
||||
* MAC-CENTRALEUROPE
|
||||
* Danish
|
||||
* UTF-8
|
||||
* IBM865
|
||||
* ISO-8859-1
|
||||
* ISO-8859-15
|
||||
* WINDOWS-1252
|
||||
* English
|
||||
* UTF-8
|
||||
* ASCII
|
||||
* Esperanto
|
||||
* UTF-8
|
||||
* ISO-8859-3
|
||||
* Estonian
|
||||
* UTF-8
|
||||
* ISO-8859-4
|
||||
* ISO-8859-13
|
||||
* ISO-8859-15
|
||||
* Windows-1252
|
||||
* Windows-1257
|
||||
* Finnish
|
||||
* UTF-8
|
||||
* ISO-8859-1
|
||||
* ISO-8859-4
|
||||
* ISO-8859-9
|
||||
* ISO-8859-13
|
||||
* ISO-8859-15
|
||||
* WINDOWS-1252
|
||||
* French
|
||||
* UTF-8
|
||||
* ISO-8859-1
|
||||
* ISO-8859-15
|
||||
* WINDOWS-1252
|
||||
* German
|
||||
* UTF-8
|
||||
* ISO-8859-1
|
||||
* WINDOWS-1252
|
||||
* Georgian
|
||||
* UTF-8
|
||||
* GEORGIAN-ACADEMY
|
||||
* GEORGIAN-PS
|
||||
* Greek
|
||||
* UTF-8
|
||||
* ISO-8859-7
|
||||
* WINDOWS-1253
|
||||
* CP737
|
||||
* Hebrew
|
||||
* UTF-8
|
||||
* ISO-8859-8
|
||||
* WINDOWS-1255
|
||||
* IBM862
|
||||
* Hindi
|
||||
* UTF-8
|
||||
* Hungarian:
|
||||
* UTF-8
|
||||
* ISO-8859-2
|
||||
* WINDOWS-1250
|
||||
* Irish Gaelic
|
||||
* UTF-8
|
||||
* ISO-8859-1
|
||||
* ISO-8859-9
|
||||
* ISO-8859-15
|
||||
* WINDOWS-1252
|
||||
* Italian
|
||||
* UTF-8
|
||||
* ISO-8859-1
|
||||
* ISO-8859-3
|
||||
* ISO-8859-9
|
||||
* ISO-8859-15
|
||||
* WINDOWS-1252
|
||||
* Japanese
|
||||
* UTF-8
|
||||
* ISO-2022-JP
|
||||
* SHIFT_JIS
|
||||
* EUC-JP
|
||||
* Korean
|
||||
* UTF-8
|
||||
* ISO-2022-KR
|
||||
* EUC-KR
|
||||
* Cyrillic
|
||||
* EUC-KR / UHC
|
||||
* Johab
|
||||
* Latvian
|
||||
* UTF-8
|
||||
* ISO-8859-4
|
||||
* ISO-8859-10
|
||||
* ISO-8859-13
|
||||
* Lithuanian
|
||||
* UTF-8
|
||||
* ISO-8859-4
|
||||
* ISO-8859-10
|
||||
* ISO-8859-13
|
||||
* Maltese
|
||||
* UTF-8
|
||||
* ISO-8859-3
|
||||
* Macedonian
|
||||
* UTF-8
|
||||
* ISO-8859-5
|
||||
* WINDOWS-1251
|
||||
* IBM855
|
||||
* Norwegian
|
||||
* UTF-8
|
||||
* IBM865
|
||||
* ISO-8859-1
|
||||
* ISO-8859-15
|
||||
* WINDOWS-1252
|
||||
* Polish:
|
||||
* UTF-8
|
||||
* ISO-8859-2
|
||||
* ISO-8859-13
|
||||
* ISO-8859-16
|
||||
* Windows-1250
|
||||
* IBM852
|
||||
* MAC-CENTRALEUROPE
|
||||
* Portuguese
|
||||
* UTF-8
|
||||
* ISO-8859-1
|
||||
* ISO-8859-9
|
||||
* ISO-8859-15
|
||||
* WINDOWS-1252
|
||||
* Romanian:
|
||||
* UTF-8
|
||||
* ISO-8859-2
|
||||
* ISO-8859-16
|
||||
* Windows-1250
|
||||
* IBM852
|
||||
* Russian
|
||||
* UTF-8
|
||||
* ISO-8859-5
|
||||
* KOI8-R
|
||||
* WINDOWS-1251
|
||||
* MAC-CYRILLIC
|
||||
* IBM866
|
||||
* IBM855
|
||||
* Greek
|
||||
* ISO-8859-7
|
||||
* WINDOWS-1253
|
||||
* Hebrew
|
||||
* ISO-8859-8
|
||||
* WINDOWS-1255
|
||||
* Thai
|
||||
* TIS-620
|
||||
* French
|
||||
* Serbian
|
||||
* UTF-8
|
||||
* ISO-8859-5
|
||||
* WINDOWS-1251
|
||||
* Slovak
|
||||
* UTF-8
|
||||
* Windows-1250
|
||||
* ISO-8859-2
|
||||
* IBM852
|
||||
* MAC-CENTRALEUROPE
|
||||
* Slovene
|
||||
* UTF-8
|
||||
* ISO-8859-2
|
||||
* ISO-8859-16
|
||||
* Windows-1250
|
||||
* IBM852
|
||||
* MAC-CENTRALEUROPE
|
||||
* Spanish
|
||||
* UTF-8
|
||||
* ISO-8859-1
|
||||
* ISO-8859-15
|
||||
* English
|
||||
* ASCII
|
||||
* Hungarian:
|
||||
* ISO-8859-2
|
||||
* WINDOWS-1250
|
||||
* WINDOWS-1252
|
||||
* Swedish
|
||||
* UTF-8
|
||||
* ISO-8859-1
|
||||
* ISO-8859-4
|
||||
* ISO-8859-9
|
||||
* ISO-8859-15
|
||||
* WINDOWS-1252
|
||||
* Thai
|
||||
* UTF-8
|
||||
* TIS-620
|
||||
* ISO-8859-11
|
||||
* Turkish:
|
||||
* UTF-8
|
||||
* ISO-8859-3
|
||||
* ISO-8859-9
|
||||
* Ukrainian:
|
||||
* UTF-8
|
||||
* WINDOWS-1251
|
||||
* Vietnamese:
|
||||
* UTF-8
|
||||
* VISCII
|
||||
* Windows-1258
|
||||
* Others
|
||||
* WINDOWS-1252
|
||||
|
||||
@ -63,26 +238,111 @@ Techniques used by universalchardet are described at http://www.mozilla.org/proj
|
||||
|
||||
urpmi libuchardet libuchardet-devel
|
||||
|
||||
### Fedora
|
||||
|
||||
dnf install uchardet uchardet-devel
|
||||
|
||||
### Gentoo
|
||||
|
||||
emerge uchardet
|
||||
|
||||
### Mac
|
||||
|
||||
brew install uchardet
|
||||
|
||||
or
|
||||
|
||||
port install uchardet
|
||||
|
||||
### Windows
|
||||
|
||||
Binary packages are provided in Fedora and Msys2 repositories. There may
|
||||
exist other pre-built packages but I am not aware of them.
|
||||
Nevertheless the library is very easily and quickly compilable under
|
||||
Windows as well, so finding a binary package is not necessary.
|
||||
Some did it successfully with the [CMake Windows
|
||||
installer](https://cmake.org/download/) and MinGW. It should be possible
|
||||
to use MinGW-w64 instead of MinGW, in particular to build both 32 and
|
||||
64-bit DLL libraries.
|
||||
|
||||
Note also that it is very easily cross-buildable (for instance from a
|
||||
GNU/Linux machine; [crossroad](https://pypi.org/project/crossroad/) may
|
||||
help, this is what we use in our CI).
|
||||
|
||||
### Build from source
|
||||
|
||||
Releases are available from:
|
||||
https://www.freedesktop.org/software/uchardet/releases/
|
||||
|
||||
If you prefer a development version, clone the git repository:
|
||||
|
||||
git clone https://gitlab.freedesktop.org/uchardet/uchardet.git
|
||||
|
||||
The source can be browsed at: https://gitlab.freedesktop.org/uchardet/uchardet
|
||||
|
||||
cmake .
|
||||
make
|
||||
make install
|
||||
|
||||
### Build with flatpak-builder
|
||||
|
||||
Here is a working "module" section to include in your Flatpak's json manifest:
|
||||
|
||||
```
|
||||
"modules": [
|
||||
{
|
||||
"name": "uchardet",
|
||||
"buildsystem": "cmake",
|
||||
"builddir": true,
|
||||
"config-opts": [ "-DCMAKE_INSTALL_LIBDIR=lib" ],
|
||||
"sources": [
|
||||
{
|
||||
...
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
### Build with CMake exported targets
|
||||
|
||||
uchardet installs a standard pkg-config file which will make it easily
|
||||
discoverable by any modern build system. Nevertheless if your project also uses
|
||||
CMake and you want to discover uchardet installation using CMake exported
|
||||
targets, you may find and link uchardet with:
|
||||
|
||||
```
|
||||
project(sample LANGUAGES C)
|
||||
find_package ( uchardet )
|
||||
if (uchardet_FOUND)
|
||||
add_executable( sample sample.c )
|
||||
target_link_libraries ( sample PRIVATE uchardet::libuchardet )
|
||||
endif ()
|
||||
```
|
||||
|
||||
Note though that we recommend the library discovery with `pkg-config` because it
|
||||
is standard and generic. Therefore it will always work, even if we decided to
|
||||
change our own build system (which is not planned right now, but may always
|
||||
happen). This is why we advise to use standard `pkg-config` discovery.
|
||||
|
||||
Some more CMake specificities may be found in the [commit
|
||||
message](https://gitlab.freedesktop.org/uchardet/uchardet/-/commit/d7dad549bd5a3442b92e861bcd2c5cda2adeea27)
|
||||
which implemented such support.
|
||||
|
||||
## Usage
|
||||
|
||||
### Command Line
|
||||
|
||||
uchardet comes with a command line tool which obviously uses its own
|
||||
library. It can be considered as a demo of `libuchardet` even though one
|
||||
can find it very useful in its own right to inspect files.
|
||||
|
||||
```
|
||||
uchardet Command Line Tool
|
||||
Version 0.0.4
|
||||
Version 0.1.0
|
||||
|
||||
Authors: BYVoid, Jehan
|
||||
Bug Report: https://github.com/BYVoid/uchardet/issues
|
||||
Bug Report: https://gitlab.freedesktop.org/uchardet/uchardet/-/issues
|
||||
|
||||
Usage:
|
||||
uchardet [Options] [File]...
|
||||
@ -90,13 +350,94 @@ Usage:
|
||||
Options:
|
||||
-v, --version Print version and build information.
|
||||
-h, --help Print this help.
|
||||
-V, --verbose Show all candidates and their confidence value.
|
||||
-w, --weight Tweak language weights.
|
||||
```
|
||||
|
||||
### Library
|
||||
|
||||
See [uchardet.h](https://github.com/BYVoid/uchardet/blob/master/src/uchardet.h)
|
||||
See [uchardet.h](https://gitlab.freedesktop.org/uchardet/uchardet/-/blob/master/src/uchardet.h)
|
||||
|
||||
## History
|
||||
|
||||
As said in the introduction, this was initially a project of Mozilla to
|
||||
allow better detection of page encodings, and it used to be part of
|
||||
Firefox. If not mistaken, this is not the case anymore (probably because
|
||||
nowadays most websites better announce their encoding, and also UTF-8 is
|
||||
much more widely spread) and the original code has been abandoned.
|
||||
|
||||
It is to be noted that a lot has changed since the original
|
||||
implementation, yet the base concept is still the same, basing detection
|
||||
not just on encoding rules, but most importantly on analysis of
|
||||
character statistics in languages.
|
||||
|
||||
Original code of `universalchardet` by Mozilla can still be retrieved from the
|
||||
[Wayback machine](https://web.archive.org/web/20150730144356/http://lxr.mozilla.org/seamonkey/source/extensions/universalchardet/).
|
||||
|
||||
1. Mozilla code was extracted and packaged into a standalone library under
|
||||
the name `uchardet` by BYVoid in 2011, in a personal repository.
|
||||
2. Starting 2015, I (i.e. Jehan) started contributing, "standardized"
|
||||
the output to be iconv-compatible, added various encoding/language
|
||||
support and streamlined generation of sources for new support of
|
||||
encoding/languages by using texts from Wikipedia as statistics source
|
||||
on languages through Python scripts. I soon became co-maintainer.
|
||||
3. In 2016, `uchardet` became a freedesktop project.
|
||||
4. Since 2015, the number of supported encodings has continuously increased,
|
||||
in particular version 0.0.6 (2016) and especially 0.0.7 (2020) added
|
||||
a lot of new supported charset-language couples.
|
||||
5. In 2021, I added language detection support.
|
||||
|
||||
## Techniques used
|
||||
|
||||
Techniques used originally by universalchardet are described at:
|
||||
https://www-archive.mozilla.org/projects/intl/universalcharsetdetection
|
||||
|
||||
As said in the "*History*" section, the base algorithm is still there,
|
||||
helping detection of charset with analysis of character statistics in
|
||||
languages.
|
||||
|
||||
This is also why it could evolve into a quite efficient language detector.
|
||||
|
||||
Furthermore it does not use any dictionary, doesn't do semantics, or
|
||||
anything of the sort. The drawback of this is that it can be wrong
|
||||
sometimes, especially on very short texts (a few words) when we don't
|
||||
have enough data to differentiate while a word search in a dictionary
|
||||
could have done the trick. The advantages are that it makes it perform
|
||||
much faster, with very small memory usage while still being extremely
|
||||
performant on discriminating among a lot of charsets and languages when
|
||||
your text is long enough.
|
||||
|
||||
## Supporting the project financially
|
||||
|
||||
I don't have a specific job around uchardet but I work on making Free
|
||||
Software exclusively. In particular I develop
|
||||
[GIMP](https://www.gimp.org/) and other Free Software within
|
||||
[ZeMarmot](https://film.zemarmot.net/) project.
|
||||
Thus uchardet is just one of the many pieces of FLOSS code I write.
|
||||
|
||||
So if you want to support my Free Software code, I suggest donating to
|
||||
*ZeMarmot* in one of these ways:
|
||||
|
||||
* Liberapay: https://liberapay.com/ZeMarmot/
|
||||
* Patreon: https://www.patreon.com/zemarmot
|
||||
* Tipeee: https://en.tipeee.com/zemarmot
|
||||
* Other (Paypal, bank transfer…): https://film.zemarmot.net/en/donate
|
||||
|
||||
It might sound weird to fund a Libre Art animation film (Creative
|
||||
Commons by-sa) to support the development of uchardet, but this is
|
||||
exactly what happens if you do, as part of the donation goes into salary
|
||||
for me. And we need more funding to continue working on Free Software
|
||||
for a living.
|
||||
|
||||
## Related Projects
|
||||
|
||||
Some of these are bindings of `uchardet`, others are forks of the same
|
||||
initial code, which has diverged over time, others are native ports in
|
||||
other languages.
|
||||
This list is not exhaustive and only meant as a point of interest. We
|
||||
don't follow the status for these projects.
|
||||
|
||||
* [R-uchardet](https://cran.r-project.org/package=uchardet) R binding on CRAN
|
||||
* [python-chardet](https://github.com/chardet/chardet) Python port
|
||||
* [ruby-rchardet](http://rubyforge.org/projects/chardet/) Ruby port
|
||||
* [juniversalchardet](http://code.google.com/p/juniversalchardet/) Java port of universalchardet
|
||||
@ -105,8 +446,34 @@ See [uchardet.h](https://github.com/BYVoid/uchardet/blob/master/src/uchardet.h)
|
||||
* [nchardet](http://www.conceptdevelopment.net/Localization/NCharDet/) C# port of chardet
|
||||
* [uchardet-enhanced](https://bitbucket.org/medoc/uchardet-enhanced) A fork of mozilla universalchardet
|
||||
* [rust-uchardet](https://github.com/emk/rust-uchardet) Rust language binding of uchardet
|
||||
* [libchardet](https://ftp.oops.org/pub/oops/libchardet/) Another C/C++ API wrapping Mozilla code.
|
||||
* [libchardet](https://github.com/Joungkyun/libchardet) Another C/C++ API wrapping Mozilla code.
|
||||
|
||||
## License
|
||||
## Used by
|
||||
|
||||
[Mozilla Public License Version 1.1](http://www.mozilla.org/MPL/1.1/)
|
||||
* [mpv](https://mpv.io/) for subtitle detection
|
||||
* [Notepad++](https://notepad-plus-plus.org/) for file encoding detection
|
||||
* [Tepl](https://wiki.gnome.org/Projects/Tepl) (gedit…)
|
||||
* [Nextcloud IOS app](https://github.com/nextcloud/ios)
|
||||
* [Codelite](https://codelite.org)
|
||||
* [QtAV](https://www.qtav.org/)
|
||||
* …
|
||||
|
||||
## Licenses
|
||||
|
||||
* [Mozilla Public License Version 1.1](http://www.mozilla.org/MPL/1.1/)
|
||||
* [GNU General Public License, version 2.0](http://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html) or later.
|
||||
* [GNU Lesser General Public License, version 2.1](http://www.gnu.org/licenses/old-licenses/lgpl-2.1.en.html) or later.
|
||||
|
||||
See the file `COPYING` for the complete text of these 3 licenses.
|
||||
|
||||
## Code of Conduct
|
||||
|
||||
The `uchardet` project is hosted by [freedesktop.org](https://www.freedesktop.org/)
|
||||
and as such follows its code of conduct. In other words, it means we
|
||||
will treat anyone with respect and expect anyone to do the same.
|
||||
|
||||
Please read [freedesktop.org Code of Conduct](https://www.freedesktop.org/wiki/CodeOfConduct).
|
||||
|
||||
In case of any problem regarding abusive behavior in uchardet project,
|
||||
please contact the maintainer (Jehan) or create a bug report (possibly
|
||||
private if needed).
|
||||
|
||||
@ -11,7 +11,7 @@
|
||||
<key>IDESourceControlProjectOriginsDictionary</key>
|
||||
<dict>
|
||||
<key>7DF9952B76987F54D66D74171482E11FAF33FDCB</key>
|
||||
<string>https://github.com/BYVoid/uchardet</string>
|
||||
<string>https://www.freedesktop.org/wiki/Software/uchardet/</string>
|
||||
</dict>
|
||||
<key>IDESourceControlProjectPath</key>
|
||||
<string>build-mac/uchardet.xcodeproj</string>
|
||||
@ -21,7 +21,7 @@
|
||||
<string>../../..</string>
|
||||
</dict>
|
||||
<key>IDESourceControlProjectURL</key>
|
||||
<string>https://github.com/BYVoid/uchardet</string>
|
||||
<string>https://www.freedesktop.org/wiki/Software/uchardet/</string>
|
||||
<key>IDESourceControlProjectVersion</key>
|
||||
<integer>111</integer>
|
||||
<key>IDESourceControlProjectWCCIdentifier</key>
|
||||
|
||||
@ -2,5 +2,5 @@ install(
|
||||
FILES
|
||||
uchardet.1
|
||||
DESTINATION
|
||||
${DIR_SHARE}/man/man1
|
||||
${CMAKE_INSTALL_MANDIR}/man1
|
||||
)
|
||||
|
||||
63
doc/README.maintainer
Normal file
63
doc/README.maintainer
Normal file
@ -0,0 +1,63 @@
|
||||
# How to do a uchardet release #
|
||||
|
||||
* Update UCHARDET_VERSION_MAJOR, UCHARDET_VERSION_MINOR and
|
||||
UCHARDET_VERSION_REVISION as needed in CMakeLists.txt.
|
||||
|
||||
* Update README.md.
|
||||
|
||||
* Commit the version change with the message "Release: version X.Y.Z."
|
||||
|
||||
* In the next commands, let's suppose such environment variable is set:
|
||||
|
||||
export UCHARDET_VERSION=x.y.z
|
||||
|
||||
* Tag and sign your release commit with:
|
||||
|
||||
git tag -s v$UCHARDET_VERSION
|
||||
|
||||
The tag message should have the header "Version x.y.z released." followed
|
||||
by a list of new features or important fixes. This tag message will be
|
||||
considered as the release note, hence has to be carefully crafted.
|
||||
|
||||
Considering that the previous release was va.b.c, you can read the full list
|
||||
of commits between a.b.c and x.y.z with:
|
||||
|
||||
git log va.b.c..
|
||||
|
||||
This should help you to build a proper release note.
|
||||
|
||||
* Push the release and the tag:
|
||||
|
||||
git push
|
||||
git push origin v$UCHARDET_VERSION
|
||||
|
||||
* Create a release tarball:
|
||||
|
||||
git archive --format=tar.xz --prefix=uchardet-$UCHARDET_VERSION/ v$UCHARDET_VERSION >uchardet-$UCHARDET_VERSION.tar.xz
|
||||
|
||||
Note: if you have not already set this up, you have to run first:
|
||||
|
||||
git config tar.tar.xz.command "xz -c"
|
||||
|
||||
Cf. EXAMPLES section in `git help archive`.
|
||||
|
||||
* Compute a SHA256 checksum:
|
||||
|
||||
sha256sum uchardet-$UCHARDET_VERSION.tar.xz > uchardet-$UCHARDET_VERSION.tar.xz.sha256
|
||||
|
||||
* Upload to annarchy download server:
|
||||
|
||||
scp uchardet-$UCHARDET_VERSION.tar.xz uchardet-$UCHARDET_VERSION.tar.xz.sha256 annarchy.freedesktop.org:/srv/www.freedesktop.org/www/software/uchardet/releases/
|
||||
|
||||
The archive and its checksum file should now be available from:
|
||||
https://www.freedesktop.org/software/uchardet/releases/
|
||||
|
||||
* Make the git tag into a Gitlab release (not automatic).
|
||||
It will be found at: https://gitlab.freedesktop.org/uchardet/uchardet/-/tags/vx.y.z
|
||||
Just click the "Edit release notes" button, and copy paste the tag comment as "release notes".
|
||||
|
||||
* Update the wiki page: https://www.freedesktop.org/wiki/Software/uchardet/
|
||||
The release note link will be:
|
||||
https://gitlab.freedesktop.org/uchardet/uchardet/-/releases/vx.y.z
|
||||
|
||||
* Spread the good news!
|
||||
@ -1,8 +1,8 @@
|
||||
.TH UCHARDET "1" "July 2011" "uchardet " "User Commands"
|
||||
.SH NAME
|
||||
uchardet \- universalchardet (Universal Charset Detector)
|
||||
uchardet \- Universal Charset Detector
|
||||
.SH DESCRIPTION
|
||||
uchardet Command Line Tool
|
||||
uchardet CLI is an encoding detector utility, which takes one or several files in unknown character encoding without any additional information, and attempts to determine the encoding of the texts. Returned encoding names are iconv-compatible.
|
||||
.SS "Usage:"
|
||||
.HP
|
||||
uchardet [\fBOptions\fR] [\fBFile\fR]...
|
||||
@ -15,4 +15,4 @@ Print help text.
|
||||
.HP
|
||||
.IP
|
||||
.PP
|
||||
uchardet Command Line Tool
|
||||
uchardet Command Line Interface
|
||||
|
||||
@ -39,8 +39,11 @@
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
# Third party modules.
|
||||
import unicodedata
|
||||
import subprocess
|
||||
import wikipedia
|
||||
import importlib
|
||||
import math
|
||||
import optparse
|
||||
import datetime
|
||||
import operator
|
||||
@ -48,6 +51,7 @@ import requests
|
||||
import sys
|
||||
import re
|
||||
import os
|
||||
import random
|
||||
|
||||
# Custom modules.
|
||||
import charsets.db
|
||||
@ -68,34 +72,57 @@ cmdline.add_option('--max-depth',
|
||||
dest = 'max_depth', default = 2)
|
||||
(options, langs) = cmdline.parse_args()
|
||||
if len(langs) < 1:
|
||||
print("Please select at least one language code.\n")
|
||||
sys.stderr.write("Please select at least one language code. ")
|
||||
sys.stderr.write("You may also choose 'all' or 'none'.\n")
|
||||
exit(1)
|
||||
if len(langs) > 1:
|
||||
print("This script is meant to generate data for one language at a time.\n")
|
||||
|
||||
current_dir = os.path.dirname(os.path.realpath(__file__))
|
||||
|
||||
with open(os.path.join(current_dir, "support.txt")) as f:
|
||||
all_langs = f.readlines()
|
||||
all_langs = [ l.strip() for l in all_langs if l.strip() != '' ]
|
||||
|
||||
if len(langs) == 1:
|
||||
if langs[0].lower() == 'none':
|
||||
langs = []
|
||||
elif langs[0].lower() == 'all':
|
||||
langs = all_langs
|
||||
|
||||
abort = False
|
||||
for lang in langs:
|
||||
if lang not in all_langs:
|
||||
abort = True
|
||||
sys.stderr.write("Error: unsupported lang: {}\n".format(lang))
|
||||
if abort:
|
||||
sys.stderr.write("Info: new langs must be added in 'script/support.txt'.\n")
|
||||
exit(1)
|
||||
lang = langs[0]
|
||||
|
||||
generated_files = []
|
||||
|
||||
for lang_arg in langs:
|
||||
lang_arg = lang_arg.lower()
|
||||
|
||||
# Load the language data.
|
||||
sys_path_backup = sys.path
|
||||
current_dir = os.path.dirname(os.path.realpath(__file__))
|
||||
sys.path = [current_dir + '/langs']
|
||||
|
||||
try:
|
||||
lang = importlib.import_module(lang.lower())
|
||||
lang = importlib.import_module(lang_arg)
|
||||
except ImportError:
|
||||
print('Unknown language code "{}": '
|
||||
'file "langs/{}.py" does not exist.'.format(lang, lang.lower()))
|
||||
sys.stderr.write('Unknown language code "{}": '
|
||||
'file "langs/{}.py" does not exist.'.format(lang_arg, lang_arg))
|
||||
exit(1)
|
||||
sys.path = sys_path_backup
|
||||
|
||||
charsets = charsets.db.load(lang.charsets)
|
||||
print("Processing language data for {} (lang/{}.py):\n".format(lang_arg, lang_arg))
|
||||
|
||||
lang_charsets = charsets.db.load(lang.charsets)
|
||||
|
||||
if not hasattr(lang, 'start_pages') or lang.start_pages is None or \
|
||||
lang.start_pages == []:
|
||||
# Let's start with the main page, assuming it should have links
|
||||
# to relevant pages. In localized Wikipedias, this page is usually redirected
|
||||
# to a relevant page.
|
||||
print("Warning: no `start_pages` set for '{}'. Using ['Main_Page'].\n"
|
||||
sys.stderr.write("Warning: no `start_pages` set for '{}'. Using ['Main_Page'].\n"
|
||||
" If you don't get good data, it is advised to set a "
|
||||
"start_pages` variable yourself.".format(lang.code))
|
||||
lang.start_pages = ['Main_Page']
|
||||
@ -107,16 +134,96 @@ if hasattr(lang, 'case_mapping'):
|
||||
lang.case_mapping = bool(lang.case_mapping)
|
||||
else:
|
||||
lang.case_mapping = False
|
||||
if not hasattr(lang, 'custom_case_mapping'):
|
||||
lang.custom_case_mapping = None
|
||||
if not hasattr(lang, 'alphabet') or lang.alphabet is None:
|
||||
lang.alphabet = None
|
||||
if not hasattr(lang, 'alphabet_mapping') or lang.alphabet_mapping is None:
|
||||
lang.alphabet_mapping = None
|
||||
if not hasattr(lang, 'unicode_ranges') or lang.unicode_ranges is None:
|
||||
lang.unicode_ranges = None
|
||||
if not hasattr(lang, 'frequent_ranges') or lang.frequent_ranges is None:
|
||||
if lang.unicode_ranges is not None:
|
||||
lang.frequent_ranges = lang.unicode_ranges
|
||||
else:
|
||||
lang.frequent_ranges = None
|
||||
|
||||
def local_lowercase(text, lang):
    """Lowercase `text` following the case rules of the language module `lang`.

    Each character of `text` is converted independently:

    * a character present in `lang.custom_case_mapping` (when that mapping
      is not None) is replaced by its mapped value, regardless of
      `lang.case_mapping`;
    * otherwise, an uppercase character is replaced by its `str.lower()`
      form when `lang.case_mapping` is true and that lowercase form is a
      single code point once NFC-normalized;
    * any other character is kept unchanged.

    Parameters:
        text: the string to convert.
        lang: an object exposing the `case_mapping` and
            `custom_case_mapping` attributes.

    Returns:
        The converted text as a new string.
    """
    custom_mapping = lang.custom_case_mapping
    use_case_mapping = lang.case_mapping

    def lowercase_char(char):
        # The language-specific mapping always takes precedence over the
        # generic Unicode lowercasing.
        if custom_mapping is not None and char in custom_mapping:
            return custom_mapping[char]
        if char.isupper() and use_case_mapping:
            lowered = char.lower()
            # Some characters have no composed lowercase form: lowercasing
            # them yields several code points. Keep those unchanged so that
            # one input character always maps to one output character.
            if len(unicodedata.normalize('NFC', lowered)) == 1:
                return lowered
        return char

    # Join once rather than growing a string with `+=` in a loop.
    return ''.join(lowercase_char(char) for char in text)
|
||||
|
||||
if lang.alphabet is not None:
|
||||
if lang.use_ascii:
|
||||
if lang.alphabet is None:
|
||||
lang.alphabet = [chr(l) for l in range(65, 91)] + [chr(l) for l in range(97, 123)]
|
||||
else:
|
||||
# Allowing to provide an alphabet in string format rather than list.
|
||||
lang.alphabet = list(lang.alphabet)
|
||||
lang.alphabet += [chr(l) for l in range(65, 91)] + [chr(l) for l in range(97, 123)]
|
||||
if lang.case_mapping:
|
||||
lang.alphabet = list(set([ l.lower() for l in lang.alphabet ]))
|
||||
if lang.alphabet is not None:
|
||||
# Allowing to provide an alphabet in string format rather than list.
|
||||
lang.alphabet = list(lang.alphabet)
|
||||
if lang.case_mapping or lang.custom_case_mapping is not None:
|
||||
lang.alphabet = [local_lowercase(l, lang) for l in lang.alphabet]
|
||||
#alphabet = []
|
||||
#for l in lang.alphabet:
|
||||
#if l.isupper() and \
|
||||
#lang.custom_case_mapping is not None and \
|
||||
#l in lang.custom_case_mapping:
|
||||
#alphabet.append(lang.custom_case_mapping[l])
|
||||
#elif l.isupper() and \
|
||||
#lang.case_mapping and \
|
||||
#len(unicodedata.normalize('NFC', l.lower())) == 1:
|
||||
#alphabet.append(l.lower())
|
||||
#else:
|
||||
#alphabet.append(l)
|
||||
lang.alphabet = list(set(lang.alphabet))
|
||||
|
||||
if lang.alphabet_mapping is not None:
|
||||
alphabet_mapping = {}
|
||||
for char in lang.alphabet_mapping:
|
||||
# Allowing to provide an alphabet in string format rather than list.
|
||||
for alt_char in list(lang.alphabet_mapping[char]):
|
||||
# While it's easier to write from main character to
|
||||
# equivalencies in the language file, we reverse the mapping
|
||||
# for simpler usage.
|
||||
if lang.case_mapping or lang.custom_case_mapping is not None:
|
||||
alphabet_mapping[alt_char] = local_lowercase(char, lang)
|
||||
else:
|
||||
alphabet_mapping[alt_char] = char
|
||||
lang.alphabet_mapping = alphabet_mapping
|
||||
|
||||
def normalize_codepoint_ranges(input_range):
    """Normalize a list of code point ranges to integer `(start, end)` pairs.

    Each bound may be given either as an integer code point or as a
    single-character string, which is converted with `ord()`.

    Invalid ranges are reported on stderr and skipped rather than aborting:
    a bound which is neither an integer nor a single character, or a range
    whose start is greater than its end.

    Parameters:
        input_range: an iterable of `(start, end)` pairs, or None.

    Returns:
        The list of valid `(start, end)` integer pairs in input order, or
        None when `input_range` is None or contains no valid range.
    """
    if input_range is None:
        return None

    output_range = []
    for start, end in input_range:
        # Allow to write down characters rather than unicode values.
        # Only single characters can be converted; longer strings are left
        # untouched so that they get reported by the type check below.
        if isinstance(start, str) and len(start) == 1:
            start = ord(start)
        if isinstance(end, str) and len(end) == 1:
            end = ord(end)

        if not isinstance(start, int) or not isinstance(end, int):
            sys.stderr.write("Expected unicode range in char or int: {}-{}.\n".format(start, end))
            # Do not compare or store bounds of an unexpected type.
            continue
        if start > end:
            sys.stderr.write("Wrong unicode range: {}-{}.\n".format(start, end))
            continue

        output_range.append((start, end))

    # Callers test the result against None to know whether ranges are set.
    return output_range or None
|
||||
|
||||
lang.unicode_ranges = normalize_codepoint_ranges(lang.unicode_ranges)
|
||||
lang.frequent_ranges = normalize_codepoint_ranges(lang.frequent_ranges)
|
||||
|
||||
# Starting processing.
|
||||
wikipedia.set_lang(lang.wikipedia_code)
|
||||
|
||||
@ -132,34 +239,64 @@ characters = {}
|
||||
sequences = {}
|
||||
prev_char = None
|
||||
|
||||
def process_text(text, clean_text, case_mapping):
|
||||
global charsets
|
||||
def process_text(content, lang):
|
||||
global lang_charsets
|
||||
global characters
|
||||
global sequences
|
||||
global prev_char
|
||||
|
||||
if clean_text is not None:
|
||||
content = clean_text(text)
|
||||
if lang.clean_wikipedia_content is not None:
|
||||
content = lang.clean_wikipedia_content(content)
|
||||
# Clean out the Wikipedia syntax for titles.
|
||||
content = re.sub(r'(=+) *([^=]+) *\1',
|
||||
r'\2', content)
|
||||
# Clean multiple spaces. Newlines and such are normalized to spaces,
|
||||
# since they have basically a similar role in the purpose of uchardet.
|
||||
content = re.sub(r'\s+', ' ', content)
|
||||
|
||||
if case_mapping:
|
||||
content = content.lower()
|
||||
if lang.case_mapping or lang.custom_case_mapping is not None:
|
||||
content = local_lowercase(content, lang)
|
||||
|
||||
# In Python 3, strings are sequences of Unicode code points.
|
||||
# Looping through them returns the expected characters.
|
||||
for char in content:
|
||||
# Map to main equivalent character.
|
||||
if lang.alphabet_mapping is not None and \
|
||||
char in lang.alphabet_mapping:
|
||||
char = lang.alphabet_mapping[char]
|
||||
|
||||
unicode_value = ord(char)
|
||||
is_letter = False
|
||||
if ord(char) in characters:
|
||||
characters[ord(char)] += 1
|
||||
if unicode_value in characters:
|
||||
characters[unicode_value] += 1
|
||||
is_letter = True
|
||||
elif lang.unicode_ranges is not None:
|
||||
for start, end in lang.unicode_ranges:
|
||||
if unicode_value >= start and unicode_value <= end:
|
||||
characters[unicode_value] = 1
|
||||
is_letter = True
|
||||
break
|
||||
else:
|
||||
# We save the character if it is at least in one of the
|
||||
# language encodings and it's not a special character.
|
||||
for charset in charsets:
|
||||
for charset in lang_charsets:
|
||||
# Does the character exist in the charset?
|
||||
try:
|
||||
codepoint = char.encode(charset, 'ignore')
|
||||
except LookupError:
|
||||
# unknown encoding. Use iconv from command line instead.
|
||||
try:
|
||||
call = subprocess.Popen(['iconv', '-f', 'UTF-8', '-t', charset],
|
||||
stdin=subprocess.PIPE, stdout=subprocess.PIPE,
|
||||
stderr=subprocess.DEVNULL)
|
||||
if call.poll() is not None:
|
||||
(_, error) = call.communicate(input='')
|
||||
sys.stderr.write('Error: `iconv` ended with error "{}".\n'.format(error))
|
||||
exit(1)
|
||||
(codepoint, _) = call.communicate(input=char.encode('UTF-8'))
|
||||
except FileNotFoundError:
|
||||
sys.stderr.write('Error: "{}" is not a supported charset by python and `iconv` is not installed.\n')
|
||||
exit(1)
|
||||
|
||||
if codepoint == b'':
|
||||
continue
|
||||
@ -168,17 +305,17 @@ def process_text(text, clean_text, case_mapping):
|
||||
# charsets if I turn the string to encoded bytes first.
|
||||
# Not sure if that is a bug or expected.
|
||||
codepoint = ord(codepoint)
|
||||
if charsets[charset].charmap[codepoint] == LET:
|
||||
characters[ord(char)] = 1
|
||||
if lang_charsets[charset].charmap[codepoint] == LET:
|
||||
characters[unicode_value] = 1
|
||||
is_letter = True
|
||||
break
|
||||
if is_letter:
|
||||
if prev_char is not None:
|
||||
if (prev_char, ord(char)) in sequences:
|
||||
sequences[(prev_char, ord(char))] += 1
|
||||
if (prev_char, unicode_value) in sequences:
|
||||
sequences[(prev_char, unicode_value)] += 1
|
||||
else:
|
||||
sequences[(prev_char, ord(char))] = 1
|
||||
prev_char = ord(char)
|
||||
sequences[(prev_char, unicode_value)] = 1
|
||||
prev_char = unicode_value
|
||||
else:
|
||||
prev_char = None
|
||||
|
||||
@ -190,29 +327,47 @@ def visit_pages(titles, depth, lang, logfd):
|
||||
return
|
||||
|
||||
next_titles = []
|
||||
if options.max_page is not None:
|
||||
max_titles = int(options.max_page/(options.max_depth * options.max_depth))
|
||||
else:
|
||||
max_titles = sys.maxsize
|
||||
for title in titles:
|
||||
if options.max_page is not None and \
|
||||
len(visited_pages) > options.max_page:
|
||||
return
|
||||
if title in visited_pages:
|
||||
continue
|
||||
|
||||
# Ugly hack skipping internal pages
|
||||
if 'wiki' in title or 'Wiki' in title:
|
||||
sys.stderr.write('Skipping {}'.format(title))
|
||||
continue
|
||||
|
||||
visited_pages += [title]
|
||||
try:
|
||||
page = wikipedia.page(title)
|
||||
page = wikipedia.page(title, auto_suggest=False)
|
||||
except (wikipedia.exceptions.PageError,
|
||||
wikipedia.exceptions.DisambiguationError):
|
||||
wikipedia.exceptions.DisambiguationError) as error:
|
||||
# Let's just discard a page when I get an exception.
|
||||
sys.stderr.write("Discarding page {}: {}\n".format(title, error))
|
||||
continue
|
||||
logfd.write("\n{} (revision {})".format(title, page.revision_id))
|
||||
logfd.flush()
|
||||
|
||||
process_text(page.content,
|
||||
lang.clean_wikipedia_content,
|
||||
lang.case_mapping)
|
||||
next_titles += page.links
|
||||
process_text(page.content, lang)
|
||||
try:
|
||||
links = page.links
|
||||
random.shuffle(links)
|
||||
if len(links) > max_titles:
|
||||
links = links[:max_titles]
|
||||
next_titles += links
|
||||
except KeyError:
|
||||
pass
|
||||
|
||||
if depth >= options.max_depth:
|
||||
return
|
||||
|
||||
random.shuffle(next_titles)
|
||||
visit_pages (next_titles, depth + 1, lang, logfd)
|
||||
|
||||
language_c = lang.name.replace('-', '_').title()
|
||||
@ -225,13 +380,15 @@ logfd.write('\n- Maximum depth: {}'.format(options.max_depth))
|
||||
if options.max_page is not None:
|
||||
logfd.write('\n- Max number of pages: {}'.format(options.max_page))
|
||||
logfd.write('\n\n== Parsed pages ==\n')
|
||||
logfd.flush()
|
||||
try:
|
||||
visit_pages(lang.start_pages, 0, lang, logfd)
|
||||
except requests.exceptions.ConnectionError:
|
||||
print('Error: connection to Wikipedia failed. Aborting\n')
|
||||
sys.stderr.write('Error: connection to Wikipedia failed. Aborting\n')
|
||||
exit(1)
|
||||
logfd.write('\n\n== End of Parsed pages ==')
|
||||
logfd.write('\n\n- Wikipedia parsing ended at: {}\n'.format(str(datetime.datetime.now())))
|
||||
logfd.flush()
|
||||
|
||||
########### CHARACTERS ###########
|
||||
|
||||
@ -257,35 +414,88 @@ accumulated_ratios = 0
|
||||
# If there is an alphabet, we make sure all the alphabet characters are in the
|
||||
# frequent list, and we stop then. There may therefore be more or less than
|
||||
# 64 frequent characters depending on the language.
|
||||
if lang.alphabet is None:
|
||||
freq_count = 64
|
||||
else:
|
||||
logfd.write('\nMost Frequent characters:')
|
||||
very_freq_count = 0
|
||||
very_freq_ratio = 0
|
||||
if lang.alphabet is None and lang.frequent_ranges is None:
|
||||
freq_count = min(64, len(sorted_ratios))
|
||||
for order, (char, ratio) in enumerate(sorted_ratios):
|
||||
if order >= freq_count:
|
||||
break
|
||||
logfd.write("\n[{:2}] Char {}: {} %".format(order, chr(char), ratio * 100))
|
||||
accumulated_ratios += ratio
|
||||
if very_freq_ratio < 0.4:
|
||||
very_freq_count += 1
|
||||
very_freq_ratio += ratio
|
||||
elif lang.alphabet is not None:
|
||||
freq_count = 0
|
||||
for order, (char, ratio) in enumerate(sorted_ratios):
|
||||
if len(lang.alphabet) == 0:
|
||||
break
|
||||
if chr(char) in lang.alphabet:
|
||||
lang.alphabet.remove(chr(char))
|
||||
logfd.write("\n[{:2}] Char {}: {} %".format(order, chr(char), ratio * 100))
|
||||
accumulated_ratios += ratio
|
||||
freq_count += 1
|
||||
if very_freq_ratio < 0.4:
|
||||
very_freq_count += 1
|
||||
very_freq_ratio += ratio
|
||||
else:
|
||||
if len(lang.alphabet) > 0:
|
||||
print("Error: alphabet characters are absent from data collection"
|
||||
sys.stderr.write("Error: alphabet characters are absent from data collection"
|
||||
"\n Please check the configuration or the data."
|
||||
"\n Missing characters: {}".format(", ".join(lang.alphabet)))
|
||||
exit(1)
|
||||
elif lang.frequent_ranges is not None:
|
||||
# How many characters in the frequent range?
|
||||
frequent_ranges_size = 0
|
||||
for start, end in lang.frequent_ranges:
|
||||
frequent_ranges_size += end - start + 1
|
||||
|
||||
logfd.write('\nFirst {} characters:'.format(freq_count))
|
||||
# Keep ratio for at least all the characters inside the frequent
|
||||
# ranges.
|
||||
freq_count = 0
|
||||
for order, (char, ratio) in enumerate(sorted_ratios):
|
||||
if order >= freq_count:
|
||||
break
|
||||
for start, end in lang.frequent_ranges:
|
||||
if char >= start and char <= end:
|
||||
freq_count += 1
|
||||
accumulated_ratios += ratio
|
||||
logfd.write("\n[{:2}] Char {}: {} %".format(order, chr(char), ratio * 100))
|
||||
frequent_ranges_size -= 1
|
||||
break
|
||||
else:
|
||||
# A frequent character in the non-frequent range.
|
||||
logfd.write("\n[{:2}] Char {}: {} %".format(order, chr(char), ratio * 100))
|
||||
freq_count += 1
|
||||
accumulated_ratios += ratio
|
||||
|
||||
if very_freq_ratio < 0.4:
|
||||
very_freq_count += 1
|
||||
very_freq_ratio += ratio
|
||||
|
||||
if frequent_ranges_size <= 0:
|
||||
break
|
||||
|
||||
low_freq_order = freq_count - 1
|
||||
low_freq_ratio = 0
|
||||
for back_order, (char, ratio) in enumerate(reversed(sorted_ratios[:freq_count])):
|
||||
if low_freq_ratio < 0.03:
|
||||
low_freq_ratio += ratio
|
||||
low_freq_order -= 1
|
||||
else:
|
||||
break
|
||||
|
||||
logfd.write("\n\nThe first {} characters have an accumulated ratio of {}.\n".format(freq_count, accumulated_ratios))
|
||||
logfd.write("The first {} characters have an accumulated ratio of {}.\n".format(very_freq_count, very_freq_ratio))
|
||||
logfd.write("All characters whose order is over {} have an accumulated ratio of {}.\n".format(low_freq_order, low_freq_ratio))
|
||||
|
||||
with open(current_dir + '/header-template.cpp', 'r') as header_fd:
|
||||
c_code = header_fd.read()
|
||||
|
||||
c_code += '\n#include "../nsSBCharSetProber.h"'
|
||||
c_code += '\n#include "../nsSBCharSetProber-generated.h"'
|
||||
c_code += '\n#include "../nsLanguageDetector.h"\n'
|
||||
c_code += '\n#include "../nsLanguageDetector-generated.h"\n'
|
||||
c_code += '\n/********* Language model for: {} *********/\n\n'.format(lang.name)
|
||||
c_code += '/**\n * Generated by {}\n'.format(os.path.basename(__file__))
|
||||
c_code += ' * On: {}\n'.format(str(datetime.datetime.now()))
|
||||
@ -312,7 +522,7 @@ c_code += \
|
||||
*/
|
||||
"""
|
||||
|
||||
for charset in charsets:
|
||||
for charset in lang_charsets:
|
||||
charset_c = charset.replace('-', '_').title()
|
||||
CTOM_str = 'static const unsigned char {}_CharToOrderMap[]'.format(charset_c)
|
||||
CTOM_str += ' =\n{'
|
||||
@ -320,7 +530,7 @@ for charset in charsets:
|
||||
CTOM_str += '\n '
|
||||
for column in range(0, 16):
|
||||
cp = line * 16 + column
|
||||
cp_type = charsets[charset].charmap[cp]
|
||||
cp_type = lang_charsets[charset].charmap[cp]
|
||||
if cp_type == ILL:
|
||||
CTOM_str += 'ILL,'
|
||||
elif cp_type == RET:
|
||||
@ -332,15 +542,53 @@ for charset in charsets:
|
||||
elif cp_type == NUM:
|
||||
CTOM_str += 'NUM,'
|
||||
else: # LET
|
||||
try:
|
||||
uchar = bytes([cp]).decode(charset)
|
||||
if lang.case_mapping and uchar.isupper():
|
||||
uchar = uchar.lower()
|
||||
except UnicodeDecodeError:
|
||||
sys.stderr.write('Unknown character 0X{:X} in {}.'.format(cp, charset))
|
||||
sys.stderr.write('Please verify your charset specification.\n')
|
||||
exit(1)
|
||||
except LookupError:
|
||||
# Unknown encoding. Use iconv instead.
|
||||
try:
|
||||
call = subprocess.Popen(['iconv', '-t', 'UTF-8', '-f', charset],
|
||||
stdin=subprocess.PIPE,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE)
|
||||
if call.poll() is not None:
|
||||
(_, error) = call.communicate(input='')
|
||||
sys.stderr.write('Error: `iconv` ended with error "{}".\n'.format(error))
|
||||
exit(1)
|
||||
(uchar, _) = call.communicate(input=bytes([cp]))
|
||||
uchar = uchar.decode('UTF-8')
|
||||
except FileNotFoundError:
|
||||
sys.stderr.write('Error: "{}" is not a supported charset by python and `iconv` is not installed.\n')
|
||||
exit(1)
|
||||
if len(uchar) == 0:
|
||||
sys.stderr.write('TypeError: iconv failed to return a unicode character for codepoint "{}" in charset {}.\n'.format(hex(cp), charset))
|
||||
exit(1)
|
||||
#if lang.case_mapping and uchar.isupper() and \
|
||||
#len(unicodedata.normalize('NFC', uchar.lower())) == 1:
|
||||
# Unless we encounter special cases of characters with no
|
||||
# composed lowercase, we lowercase it.
|
||||
if lang.case_mapping or lang.custom_case_mapping is not None:
|
||||
uchar = local_lowercase(uchar, lang)
|
||||
if lang.alphabet_mapping is not None and uchar in lang.alphabet_mapping:
|
||||
uchar = lang.alphabet_mapping[uchar]
|
||||
for order, (char, ratio) in enumerate(sorted_ratios):
|
||||
if char == ord(uchar):
|
||||
CTOM_str += '{:3},'.format(order)
|
||||
CTOM_str += '{:3},'.format(min(249, order))
|
||||
break
|
||||
else:
|
||||
CTOM_str += '{:3},'.format(n_char)
|
||||
# XXX: we must make sure the character order does not go
|
||||
# over the special characters (250 currently). This may
|
||||
# actually happen when building a model for a language
|
||||
# writable with many different encodings. So let's just
|
||||
# cap the order value at 249 max.
|
||||
# It may be an interesting alternative to add another
|
||||
# constant for any character with an order > freqCharCount.
|
||||
# Maybe IRR (irrelevant character) or simply CHR.
|
||||
CTOM_str += '{:3},'.format(min(249, n_char))
|
||||
n_char += 1
|
||||
CTOM_str += ' /* {:X}X */'.format(line)
|
||||
CTOM_str += '\n};\n/*'
|
||||
@ -348,41 +596,135 @@ for charset in charsets:
|
||||
CTOM_str += ' */\n\n'
|
||||
c_code += CTOM_str
|
||||
|
||||
## UNICODE frequency.
|
||||
|
||||
# Since we can't map the full character table from encoding to order,
|
||||
# just create a list from the most common characters from the language.
|
||||
# The list is ordered by unicode code points (hence can be used
|
||||
# generically for various encoding scheme as it is not encoding
|
||||
# specific) allowing to search from code points efficiently by a divide
|
||||
# and conqueer search algorithm.
|
||||
# Each code point is immediately followed by its order.
|
||||
|
||||
# Keep the freq_count more frequent characters.
|
||||
sorted_chars = [(char, freq, order) for order, (char, freq) in
|
||||
enumerate(sorted_ratios)][:freq_count]
|
||||
max_order = len(sorted_chars)
|
||||
|
||||
# Add equivalency characters.
|
||||
equivalent = []
|
||||
if lang.case_mapping:
|
||||
for char, ratio, order in sorted_chars:
|
||||
uppercased = chr(char).upper()
|
||||
try:
|
||||
if char != ord(uppercased):
|
||||
equivalent += [(ord(uppercased), ratio, order)]
|
||||
except TypeError:
|
||||
# This happens for some case such as 'SS' as uppercase of 'ß'.
|
||||
# Just ignore such cases.
|
||||
sys.stderr.write("Ignoring '{}' as uppercase equivalent of '{}'.\n".format(uppercased, char))
|
||||
|
||||
if lang.alphabet_mapping is not None:
|
||||
for alt_c in lang.alphabet_mapping:
|
||||
for char, ratio, order in sorted_chars:
|
||||
if alt_c == chr(char):
|
||||
sys.stderr.write("ALREADY {}\n".format(alt_c))
|
||||
exit(1)
|
||||
elif char == ord(lang.alphabet_mapping[alt_c]):
|
||||
equivalent += [(ord(alt_c), ratio, order)]
|
||||
break
|
||||
else:
|
||||
sys.stderr.write("Base equivalent for {} not found in frequent characters!\n".format(alt_c))
|
||||
exit(1)
|
||||
|
||||
sorted_chars += equivalent
|
||||
|
||||
# Order by code point.
|
||||
sorted_chars = sorted(sorted_chars, key=operator.itemgetter(0))
|
||||
|
||||
CTOM_str = 'static const int Unicode_Char_size = {};\n'.format(len(sorted_chars))
|
||||
|
||||
CTOM_str += 'static const unsigned int Unicode_CharOrder[]'
|
||||
CTOM_str += ' =\n{'
|
||||
column = 0
|
||||
|
||||
max_char_width = math.floor(math.log10(sorted_chars[-1][0])) + 1
|
||||
max_order_width = math.floor(math.log10(max_order)) + 1
|
||||
|
||||
for char, ratio, order in sorted_chars:
|
||||
if column % 8 == 0:
|
||||
CTOM_str += '\n '
|
||||
column += 1
|
||||
CTOM_str += '{}{:>{width}}, '.format('' if column % 8 == 0 else ' ', char, width=max_char_width)
|
||||
CTOM_str += '{:>{width}},'.format(order, width=max_order_width)
|
||||
|
||||
CTOM_str += '\n};\n\n'
|
||||
c_code += CTOM_str
|
||||
|
||||
########### SEQUENCES ###########
|
||||
|
||||
ratios = {}
|
||||
occurrences = sum(sequences.values())
|
||||
ratio_512 = 0
|
||||
ratio_1024 = 0
|
||||
|
||||
accumulated_seq_count = 0
|
||||
order_3 = -1
|
||||
order_2 = -1
|
||||
ratio_3 = -1
|
||||
ratio_2 = -1
|
||||
count_512 = -1
|
||||
count_1024 = -1
|
||||
sorted_seqs = sorted(sequences.items(), key=operator.itemgetter(1),
|
||||
reverse=True)
|
||||
for order, ((c1, c2), count) in enumerate(sorted_seqs):
|
||||
accumulated_seq_count += count
|
||||
if order_3 == -1 and accumulated_seq_count / occurrences >= 0.995:
|
||||
order_3 = order
|
||||
ratio_3 = accumulated_seq_count / occurrences
|
||||
elif order_2 == -1 and accumulated_seq_count / occurrences >= 0.999:
|
||||
order_2 = order
|
||||
ratio_2 = accumulated_seq_count / occurrences
|
||||
if order < 512:
|
||||
ratio_512 += count
|
||||
count_512 += count
|
||||
elif order < 1024:
|
||||
ratio_1024 += count
|
||||
else:
|
||||
count_1024 += count
|
||||
|
||||
if order_3 != -1 and order_2 != -1:
|
||||
break
|
||||
ratio_512 /= occurrences
|
||||
ratio_1024 /= occurrences
|
||||
|
||||
if order_3 == -1 or order_2 == -1:
|
||||
# This would probably never happen. It would require a language with
|
||||
# very few possible sequences and each of the sequences are widely
|
||||
# used. Just add this code for completeness, but it won't likely ever be
|
||||
# run.
|
||||
order_2 = 512
|
||||
order_3 = 1024
|
||||
ratio_2 = count_512 / occurrences
|
||||
ratio_3 = count_1024 / occurrences
|
||||
|
||||
logfd.write("\n{} sequences found.\n".format(len(sorted_seqs)))
|
||||
|
||||
c_code += """
|
||||
/* Model Table:
|
||||
* Total sequences: {}
|
||||
* First 512 sequences: {}
|
||||
* Next 512 sequences (512-1024): {}
|
||||
* Rest: {}
|
||||
* Total considered sequences: {} / {}
|
||||
* - Positive sequences: first {} ({})
|
||||
* - Probable sequences: next {} ({}-{}) ({})
|
||||
* - Neutral sequences: last {} ({})
|
||||
* - Negative sequences: {} (off-ratio)
|
||||
* Negative sequences: TODO""".format(len(sorted_seqs),
|
||||
ratio_512,
|
||||
ratio_1024,
|
||||
1 - ratio_512 - ratio_1024)
|
||||
freq_count * freq_count,
|
||||
order_3, ratio_3,
|
||||
order_2 - order_3,
|
||||
order_2, order_3,
|
||||
ratio_2 - ratio_3,
|
||||
freq_count * freq_count - order_2,
|
||||
1 - ratio_2,
|
||||
freq_count * freq_count - len(sorted_seqs))
|
||||
|
||||
logfd.write("\nFirst 512 (typical positive ratio): {}".format(ratio_512))
|
||||
logfd.write("\nNext 512 (512-1024): {}".format(ratio))
|
||||
logfd.write("\nRest: {}".format(1 - ratio_512 - ratio_1024))
|
||||
logfd.write("\nFirst {} (typical positive ratio): {}".format(order_3, ratio_3))
|
||||
logfd.write("\nNext {} ({}-{}): {}".format(order_2 - order_3,
|
||||
order_2, order_3,
|
||||
ratio_2 - ratio_3))
|
||||
logfd.write("\nRest: {}".format(1 - ratio_2))
|
||||
|
||||
c_code += "\n */\n"
|
||||
|
||||
@ -402,9 +744,9 @@ for line in range(0, freq_count):
|
||||
if (first_char, second_char) in sequences:
|
||||
for order, (seq, _) in enumerate(sorted_seqs):
|
||||
if seq == (first_char, second_char):
|
||||
if order < 512:
|
||||
if order < order_3:
|
||||
LM_str += '3,'
|
||||
elif order < 1024:
|
||||
elif order < order_2:
|
||||
LM_str += '2,'
|
||||
else:
|
||||
LM_str += '1,'
|
||||
@ -421,18 +763,34 @@ for line in range(0, freq_count):
|
||||
LM_str += '\n};\n'
|
||||
c_code += LM_str
|
||||
|
||||
for charset in charsets:
|
||||
for charset in lang_charsets:
|
||||
charset_c = charset.replace('-', '_').title()
|
||||
SM_str = '\n\nconst SequenceModel {}{}Model ='.format(charset_c, language_c)
|
||||
SM_str += '\n{\n '
|
||||
SM_str += '{}_CharToOrderMap,\n {}LangModel,'.format(charset_c, language_c)
|
||||
SM_str += '\n {},'.format(freq_count)
|
||||
SM_str += '\n (float){},'.format(ratio_512)
|
||||
SM_str += '\n (float){},'.format(ratio_2)
|
||||
SM_str += '\n {},'.format('PR_TRUE' if lang.use_ascii else 'PR_FALSE')
|
||||
SM_str += '\n "{}"'.format(charset)
|
||||
SM_str += '\n "{}",'.format(charset)
|
||||
SM_str += '\n "{}"'.format(lang.code)
|
||||
SM_str += '\n};'
|
||||
c_code += SM_str
|
||||
|
||||
SM_str = '\n\nconst LanguageModel {}Model ='.format(language_c)
|
||||
SM_str += '\n{'
|
||||
SM_str += '\n "{}",'.format(lang.code)
|
||||
SM_str += '\n Unicode_CharOrder,'
|
||||
SM_str += '\n {},'.format(len(sorted_chars)) # Order is wrong!
|
||||
SM_str += '\n {}LangModel,'.format(language_c)
|
||||
SM_str += '\n {},'.format(freq_count)
|
||||
SM_str += '\n {},'.format(very_freq_count)
|
||||
SM_str += '\n (float){},'.format(very_freq_ratio)
|
||||
SM_str += '\n {},'.format(low_freq_order)
|
||||
SM_str += '\n (float){},'.format(low_freq_ratio)
|
||||
SM_str += '\n};'
|
||||
c_code += SM_str
|
||||
|
||||
c_code += '\n'
|
||||
|
||||
lang_model_file = current_dir + '/../src/LangModels/Lang{}Model.cpp'.format(language_c)
|
||||
with open(lang_model_file, 'w') as cpp_fd:
|
||||
@ -441,6 +799,89 @@ with open(lang_model_file, 'w') as cpp_fd:
|
||||
logfd.write('\n\n- Processing end: {}\n'.format(str(datetime.datetime.now())))
|
||||
logfd.close()
|
||||
|
||||
print("The following language model file has been generated: {}"
|
||||
"\nThe build log is available in: {}"
|
||||
"\nTest them and commit them.".format(lang_model_file, build_log))
|
||||
generated_files += [ (lang_model_file, build_log) ]
|
||||
|
||||
charset_cpp = os.path.join(current_dir, '../src', 'nsSBCharSetProber-generated.h')
|
||||
print("\nGenerating {}…".format(charset_cpp))
|
||||
|
||||
with open(charset_cpp, 'w') as cpp_fd:
|
||||
with open(current_dir + '/header-template.cpp', 'r') as header_fd:
|
||||
cpp_fd.write(header_fd.read())
|
||||
|
||||
cpp_fd.write('\n#ifndef nsSingleByteCharSetProber_generated_h__')
|
||||
cpp_fd.write('\n#define nsSingleByteCharSetProber_generated_h__\n')
|
||||
|
||||
all_extern_declarations = ''
|
||||
n_sequence_models = 0
|
||||
for l in all_langs:
|
||||
l = l.lower()
|
||||
# Load the language data.
|
||||
sys_path_backup = sys.path
|
||||
sys.path = [current_dir + '/langs']
|
||||
try:
|
||||
lang = importlib.import_module(l)
|
||||
except ImportError:
|
||||
sys.stderr.write('Unknown language code "{}": '
|
||||
'file "langs/{}.py" does not exist.'.format(l, l))
|
||||
exit(1)
|
||||
sys.path = sys_path_backup
|
||||
|
||||
language_c = lang.name.replace('-', '_').title()
|
||||
lang_charsets = charsets.db.load(lang.charsets)
|
||||
for charset in lang_charsets:
|
||||
charset_c = charset.replace('-', '_').title()
|
||||
all_extern_declarations += '\nextern const SequenceModel {}{}Model;'.format(charset_c, language_c)
|
||||
n_sequence_models += 1
|
||||
all_extern_declarations += '\n'
|
||||
|
||||
cpp_fd.write('\n#define NUM_OF_SEQUENCE_MODELS {}\n'.format(n_sequence_models))
|
||||
cpp_fd.write('{}'.format(all_extern_declarations))
|
||||
cpp_fd.write('\n#endif /* nsSingleByteCharSetProber_generated_h__ */')
|
||||
|
||||
print("Done!")
|
||||
|
||||
language_cpp = os.path.join(current_dir, '../src', 'nsLanguageDetector-generated.h')
|
||||
print("\nGenerating {}…".format(language_cpp))
|
||||
|
||||
with open(language_cpp, 'w') as cpp_fd:
|
||||
with open(current_dir + '/header-template.cpp', 'r') as header_fd:
|
||||
cpp_fd.write(header_fd.read())
|
||||
|
||||
cpp_fd.write('\n#ifndef nsLanguageDetector_h_generated_h__')
|
||||
cpp_fd.write('\n#define nsLanguageDetector_h_generated_h__\n')
|
||||
|
||||
all_extern_declarations = ''
|
||||
n_language_models = 0
|
||||
for l in all_langs:
|
||||
l = l.lower()
|
||||
# Load the language data.
|
||||
sys_path_backup = sys.path
|
||||
sys.path = [current_dir + '/langs']
|
||||
try:
|
||||
lang = importlib.import_module(l)
|
||||
except ImportError:
|
||||
sys.stderr.write('Unknown language code "{}": '
|
||||
'file "langs/{}.py" does not exist.'.format(l, l))
|
||||
exit(1)
|
||||
sys.path = sys_path_backup
|
||||
|
||||
language_c = lang.name.replace('-', '_').title()
|
||||
all_extern_declarations += '\nextern const LanguageModel {}Model;'.format(language_c)
|
||||
n_language_models += 1
|
||||
|
||||
cpp_fd.write('\n#define NUM_OF_LANGUAGE_MODELS {}\n'.format(n_language_models))
|
||||
cpp_fd.write('{}'.format(all_extern_declarations))
|
||||
cpp_fd.write('\n\n#endif /* nsLanguageDetector_h_generated_h__ */')
|
||||
|
||||
print("Done!")
|
||||
if len(generated_files) > 0:
|
||||
print("\nThe following language files has been generated:")
|
||||
for (lang_model_file, build_log) in generated_files:
|
||||
print("\n- Language file: {}".format(lang_model_file))
|
||||
print("\n Build log: {}".format(build_log))
|
||||
|
||||
print("\nTODO:")
|
||||
print("- edit nsSBCSGroupProber::nsSBCSGroupProber() in src/nsSBCSGroupProber.cpp manually to test new sequence models;")
|
||||
print("- edit nsMBCSGroupProber::nsMBCSGroupProber() in src/nsMBCSGroupProber.cpp manually to test new language models;")
|
||||
print("- add any new language files to src/CMakeLists.txt;")
|
||||
print("- commit generated files if tests are successful.")
|
||||
|
||||
266
script/BuildLangModelLogs/LangArabicModel.log
Normal file
266
script/BuildLangModelLogs/LangArabicModel.log
Normal file
@ -0,0 +1,266 @@
|
||||
= Logs of language model for Arabic (ar) =
|
||||
|
||||
- Generated by BuildLangModel.py
|
||||
- Started: 2022-12-14 21:44:44.406122
|
||||
- Maximum depth: 4
|
||||
- Max number of pages: 200
|
||||
|
||||
== Parsed pages ==
|
||||
|
||||
الصفحة_الرئيسية (revision 53908210)
|
||||
مرصد لاسيلا (revision 58136208)
|
||||
ويكي (revision 60117664)
|
||||
نيقولا الثاني إمبراطور روسيا (revision 59117389)
|
||||
الثورة الروسية (revision 59438419)
|
||||
مسجد (revision 60128354)
|
||||
السلطة الوطنية الفلسطينية (revision 59672379)
|
||||
أوتوقراطية (revision 58739029)
|
||||
تصوير سينمائي (revision 60120015)
|
||||
تاريخ الطب (revision 59644751)
|
||||
العراق (revision 60057865)
|
||||
كاتدرائية (revision 59359806)
|
||||
كنيس (revision 57336394)
|
||||
تاريخ علم البصريات (revision 59375471)
|
||||
جورج ميلييس (revision 58054424)
|
||||
طب الغدد الصماء التناسلية والعقم (revision 53959553)
|
||||
جورج ليديارد ستودارد (revision 59720930)
|
||||
مساءلة القيادة (revision 57653475)
|
||||
شعب الله (revision 59561049)
|
||||
تلسكوب شميدت (revision 58346125)
|
||||
إقليم كوكيمبو (revision 52162500)
|
||||
واي باك مشين (revision 60022392)
|
||||
مذبحة معالوت (revision 60163667)
|
||||
عيد الجمهورية (revision 58550344)
|
||||
الحزب الاشتراكي الثوري (revision 60052479)
|
||||
علم النباتات الشعبي (revision 58976134)
|
||||
رئيس أساقفة كانتربيري (revision 60048548)
|
||||
تبادل الملابس (revision 53664095)
|
||||
نادي الشرطة (العراق) (revision 60163019)
|
||||
ميتزفة (revision 59891451)
|
||||
ربيع الأول (revision 57475879)
|
||||
بوليوود (revision 60021177)
|
||||
معاهدة لوزان (revision 58993178)
|
||||
بيلاروس (revision 60058526)
|
||||
فلسطين (revision 60126849)
|
||||
مخرج أفلام (revision 60120194)
|
||||
نجمة داود (revision 60123232)
|
||||
برديات طبية مصرية (revision 58379785)
|
||||
أفغانستان (revision 60102590)
|
||||
تجربة إيفري ومكلاود ومكارتي (revision 54052592)
|
||||
مجلس الدوما في الإمبراطورية الروسية (revision 59315526)
|
||||
معركة كورونل (revision 57430974)
|
||||
حاخام (revision 59724648)
|
||||
سفارة دولة فلسطين لدى السلفادور (revision 51285917)
|
||||
جهاز اقتران الشحنة (revision 60106498)
|
||||
الاتحاد الأوروبي (revision 60165557)
|
||||
ملكيانية (revision 54689229)
|
||||
محافظة حلبجة (revision 60048671)
|
||||
الإصلاح المضاد (revision 59469285)
|
||||
يعقوب (revision 60016678)
|
||||
صورة كامنة (revision 54674215)
|
||||
فلسطين المحتلة (revision 59919879)
|
||||
مرصد لاس كامباناس (revision 57843262)
|
||||
عبرية توراتية (لغة) (revision 59267911)
|
||||
كاتدرائية آخن (revision 59713924)
|
||||
تصوير حراري (revision 60057711)
|
||||
بيوتر نيكولايفيتش رنجل (revision 60102081)
|
||||
قيادة تحولية (revision 59308120)
|
||||
البرازيل خلال الحرب العالمية الأولى (revision 59085904)
|
||||
بنو إسرائيل (revision 59670914)
|
||||
الحملة السنوسية (revision 49545895)
|
||||
كوكب صغير (revision 60016777)
|
||||
قدار بن سالف (revision 58324529)
|
||||
طب الكوارث (revision 54112608)
|
||||
حزبقراطية (revision 54704842)
|
||||
فيديريكا موغيريني (revision 58573119)
|
||||
محمود عباس (revision 60159687)
|
||||
مسألة كاف 300 (revision 60163851)
|
||||
صحراء أتاكاما (revision 57646517)
|
||||
تونس (revision 60058567)
|
||||
أصل الأنواع (revision 59997199)
|
||||
1656 (revision 60060584)
|
||||
حساس (تصوير) (revision 60154421)
|
||||
الحرب الروسية اليابانية (revision 58917765)
|
||||
إنتل سات (revision 60107796)
|
||||
الموحدون الدروز والمسيحية (revision 59557380)
|
||||
اقتصاد تشاركي (revision 60121575)
|
||||
المرصد الأوروبي الجنوبي (revision 60105009)
|
||||
تشيلي (revision 59597855)
|
||||
معاهدة برست ليتوفسك (revision 59823236)
|
||||
منظمة سيرفاس الدولية (revision 60155535)
|
||||
مسجد البازار (غيروكاستر) (revision 53489072)
|
||||
كاتدرائية فلورنسا (revision 59732540)
|
||||
مصور سينمائي (revision 59163732)
|
||||
آيا صوفيا (revision 60004674)
|
||||
زكريا (revision 60043552)
|
||||
العلاقات العراقية النيجيرية (revision 50931777)
|
||||
العالمية السياسية (revision 60136194)
|
||||
اتفاقية سانت جان دي مورين (revision 58361635)
|
||||
بقايا نجمية متراصة (revision 60136947)
|
||||
ليئة (revision 57605271)
|
||||
خزف إسلامي (revision 60136331)
|
||||
فيينا (revision 59970205)
|
||||
جون بيرشنغ (revision 58136100)
|
||||
ألكسندر الأول (revision 59225816)
|
||||
كاري موليس (revision 58379354)
|
||||
هوليوود (revision 58383299)
|
||||
مرصد مولارد الراديوي (revision 60124524)
|
||||
الوكالة الأوروبية لأبحاث الفضاء (revision 60066050)
|
||||
شعار الدرع (revision 58202021)
|
||||
سبخة (revision 49219084)
|
||||
المتحف البريطاني (revision 58799935)
|
||||
تمييز (revision 59720154)
|
||||
إدموند ألنبي (revision 59888141)
|
||||
قم (revision 59290213)
|
||||
أسماء الله في اليهودية (revision 55763993)
|
||||
الانتداب الفرنسي على لبنان (revision 58593971)
|
||||
الوضع السياسي للقدس (revision 60123213)
|
||||
إمبراطورية المغول (revision 60166300)
|
||||
نقل بالسكك الحديدية (revision 58081869)
|
||||
استقطاب سياسي (revision 58743074)
|
||||
غارة عين الصاحب (revision 59043299)
|
||||
راينهارد جنزيل (revision 60106037)
|
||||
حملة أرض الصومال (revision 59456038)
|
||||
تلسكوب هابل الفضائي (revision 60058263)
|
||||
باتريون (revision 60085792)
|
||||
سفارة دولة فلسطين لدى النرويج (revision 52378186)
|
||||
يهود (revision 59707183)
|
||||
معركة ماس تييرا (revision 57430973)
|
||||
لجان المقاومة الشعبية (revision 59343775)
|
||||
ديفيد ماكليلاند (revision 57263809)
|
||||
تايكوندو (revision 59888517)
|
||||
مراسلات حسين - مكماهون (revision 60139105)
|
||||
هرم المناصب الروماني (revision 31639422)
|
||||
زينب بنت جحش (revision 59996782)
|
||||
مسجد الملك (بيرات) (revision 49377001)
|
||||
أسفار الأنبياء (revision 59221723)
|
||||
آنا إيفانوفنا إمبراطورة روسيا (revision 59518620)
|
||||
سرعة شريط التصوير الضوئي (revision 60022768)
|
||||
فيلم (revision 60028897)
|
||||
1885 (revision 57454888)
|
||||
جبت (revision 59552372)
|
||||
الشرق الأوسط الكبير (revision 60139256)
|
||||
أليكسي ريكوف (revision 59309449)
|
||||
1995 (revision 60100502)
|
||||
ثورات سواحل الأطلسي (revision 60135914)
|
||||
حرب (revision 59775414)
|
||||
مخطط القصة (revision 56758730)
|
||||
هيروهيتو (revision 58263851)
|
||||
علم وطني (revision 56004084)
|
||||
776 (revision 56920170)
|
||||
جورج دي بوفون (revision 60127133)
|
||||
غرفة سحابية (revision 60123675)
|
||||
موقع ويب (revision 60153516)
|
||||
فيلم روائي (revision 57665694)
|
||||
روبرت لانسينغ (revision 58523805)
|
||||
العلاقات الفيجية النيجيرية (revision 51111158)
|
||||
شعار سلطنة عمان (revision 59709848)
|
||||
فيصل الأول (revision 60086179)
|
||||
تشاد (revision 59975295)
|
||||
يمين مسيحي (revision 59807927)
|
||||
ابن خلدون (revision 60130285)
|
||||
اللغة الألمانية (revision 60024307)
|
||||
ستيفن لوبيز (revision 54996661)
|
||||
سان فرانسيسكو (revision 59605811)
|
||||
نيكولاي الأول (revision 60160741)
|
||||
الأرجنتين (revision 60091547)
|
||||
بانديرانتس (revision 59738382)
|
||||
مقياس الارتفاع (revision 60020176)
|
||||
إقليم أنتوفاغاستا (revision 51009451)
|
||||
31 يوليو (revision 59153451)
|
||||
ذو الكفل (revision 60020149)
|
||||
حكم الحزبين (revision 58689652)
|
||||
1911 (revision 60099079)
|
||||
قزم أسود (revision 54691902)
|
||||
أوتو ديلز (revision 57259077)
|
||||
سفارة دولة فلسطين لدى السودان (revision 51986389)
|
||||
تعددية عرقية (revision 60028505)
|
||||
متر مربع (revision 60024289)
|
||||
هيلاند هال (revision 54992619)
|
||||
24 يوليو (revision 58217047)
|
||||
أرجوان صور (revision 57562682)
|
||||
سياسة عامة (revision 59561456)
|
||||
|
||||
== End of Parsed pages ==
|
||||
|
||||
- Wikipedia parsing ended at: 2022-12-14 21:49:25.799369
|
||||
|
||||
102 characters appeared 1350972 times.
|
||||
|
||||
Most Frequent characters:
|
||||
[ 0] Char ا: 14.618659750165067 %
|
||||
[ 1] Char ل: 11.290019334227503 %
|
||||
[ 2] Char ي: 8.092173634982812 %
|
||||
[ 3] Char م: 6.058822832745609 %
|
||||
[ 4] Char و: 5.61795507234791 %
|
||||
[ 5] Char ن: 5.116464293856572 %
|
||||
[ 6] Char ر: 4.7137912554812385 %
|
||||
[ 7] Char ت: 4.240650435390222 %
|
||||
[ 8] Char ب: 3.6851985089254256 %
|
||||
[ 9] Char ة: 3.3367827016400042 %
|
||||
[10] Char ع: 3.1192356318265664 %
|
||||
[11] Char د: 2.9317409983330522 %
|
||||
[12] Char س: 2.6520164740645993 %
|
||||
[13] Char ف: 2.4876163236543762 %
|
||||
[14] Char ك: 2.095602277471332 %
|
||||
[15] Char ق: 2.0453421684535282 %
|
||||
[16] Char ه: 1.9926393737249921 %
|
||||
[17] Char أ: 1.9186185946118797 %
|
||||
[18] Char ح: 1.6858232443011403 %
|
||||
[19] Char ج: 1.265089135822208 %
|
||||
[20] Char ط: 1.0044619725649384 %
|
||||
[21] Char ص: 0.8890635779275959 %
|
||||
[22] Char ش: 0.8848443935181484 %
|
||||
[23] Char ى: 0.8022372040279148 %
|
||||
[24] Char خ: 0.7990543105260508 %
|
||||
[25] Char إ: 0.785360466390125 %
|
||||
[26] Char ث: 0.6229588770159559 %
|
||||
[27] Char ز: 0.5997903731535517 %
|
||||
[28] Char ض: 0.5947569601738599 %
|
||||
[29] Char ذ: 0.5238450537834981 %
|
||||
[30] Char غ: 0.479728669432083 %
|
||||
[31] Char ئ: 0.39675137604628374 %
|
||||
[32] Char ء: 0.3206580151180039 %
|
||||
[33] Char ظ: 0.22872420745951805 %
|
||||
[34] Char e: 0.16810118936587878 %
|
||||
[35] Char a: 0.14900382835469572 %
|
||||
[36] Char i: 0.1424159790136287 %
|
||||
[37] Char o: 0.12627944916697015 %
|
||||
[38] Char n: 0.12442892968914233 %
|
||||
[39] Char r: 0.11643468554492617 %
|
||||
[40] Char آ: 0.11132725178612139 %
|
||||
[41] Char ؤ: 0.10577569335263795 %
|
||||
[42] Char t: 0.10251877907166099 %
|
||||
[43] Char s: 0.0914156622046941 %
|
||||
[44] Char l: 0.08149687780353701 %
|
||||
[45] Char c: 0.054997438881042686 %
|
||||
[46] Char d: 0.050112067459577254 %
|
||||
[47] Char u: 0.04774340252795765 %
|
||||
[48] Char h: 0.04493061292165937 %
|
||||
[49] Char S: 0.03738049345212188 %
|
||||
[50] Char m: 0.03493780774138916 %
|
||||
[51] Char p: 0.03190295579775154 %
|
||||
[52] Char g: 0.029756353203471277 %
|
||||
[53] Char y: 0.02723964671362545 %
|
||||
[54] Char C: 0.026573459701607433 %
|
||||
[55] Char A: 0.025981293468702538 %
|
||||
[56] Char f: 0.023908711653535378 %
|
||||
[57] Char ـ: 0.02383469087442227 %
|
||||
[58] Char P: 0.022058192175707564 %
|
||||
[59] Char B: 0.021540046721915777 %
|
||||
[60] Char T: 0.020947880489010876 %
|
||||
[61] Char b: 0.020725818151671536 %
|
||||
[62] Char I: 0.019319423348522397 %
|
||||
[63] Char D: 0.01717282075424213 %
|
||||
|
||||
The first 64 characters have an accumulated ratio of 0.9979673894055535.
|
||||
The first 4 characters have an accumulated ratio of 0.40059675552120994.
|
||||
All characters whose order is over 29 have an accumulated ratio of 0.033261237094477154.
|
||||
|
||||
2046 sequences found.
|
||||
|
||||
First 1011 (typical positive ratio): 0.9950032726169761
|
||||
Next 415 (1426-1011): 0.004000386891877872
|
||||
Rest: 0.0009963404911460527
|
||||
|
||||
- Processing end: 2022-12-14 21:49:26.063356
|
||||
240
script/BuildLangModelLogs/LangBelarusianModel.log
Normal file
240
script/BuildLangModelLogs/LangBelarusianModel.log
Normal file
@ -0,0 +1,240 @@
|
||||
= Logs of language model for Belarusian (be) =
|
||||
|
||||
- Generated by BuildLangModel.py
|
||||
- Started: 2022-12-17 18:45:44.158196
|
||||
- Maximum depth: 4
|
||||
- Max number of pages: 200
|
||||
|
||||
== Parsed pages ==
|
||||
|
||||
Максім_Танк (revision 4282144)
|
||||
Польская Рэспубліка (1918—1939) (revision 4280541)
|
||||
Літаратурная прэмія імя Янкі Купалы (revision 4017964)
|
||||
Мядзел (revision 4262814)
|
||||
Каралеўская бібліятэка Швецыі (revision 4114661)
|
||||
Калоссе (1935) (revision 3858825)
|
||||
Наша Ніва (1991) (revision 4100218)
|
||||
Леанід Уладзіміравіч Маракоў (revision 4053060)
|
||||
Дзмітрый Браніслававіч Смольскі (revision 4282791)
|
||||
Васіль Філімонавіч Шавура (revision 3397335)
|
||||
Леанід Дранько-Майсюк (revision 4280504)
|
||||
Рэферэндум у Беларусі, 1995 (revision 4133742)
|
||||
Дзіцячая літаратура (revision 4215153)
|
||||
1990 (revision 3826851)
|
||||
Барысаў (паэма) (revision 3675556)
|
||||
Часопіс (revision 4062833)
|
||||
1940 (revision 4141940)
|
||||
Літаратурная прэмія імя Якуба Коласа (revision 3790577)
|
||||
Мікалай Дамашкевіч (revision 4124871)
|
||||
Паўночная Інгрыя (revision 4022023)
|
||||
Антон Браніслававіч Насілоўскі (revision 3575651)
|
||||
Джэймс Фенімар Купер (revision 3516371)
|
||||
Ханс Крысціян Андэрсен (revision 3845458)
|
||||
Virtual International Authority File (revision 4119042)
|
||||
1912 (revision 4201938)
|
||||
Кампазітар (revision 4086673)
|
||||
Парламенцкія выбары ў Беларусі (2012) (revision 4056679)
|
||||
Янка Купала (revision 4297880)
|
||||
Уладзімір Някляеў (revision 4061577)
|
||||
TUT.BY (revision 4254319)
|
||||
Гарады Мінскай вобласці (revision 4290488)
|
||||
Вільня (revision 4260328)
|
||||
Брэсцкая вобласць (revision 4095450)
|
||||
Украінская мова (revision 4281826)
|
||||
Сацыяльная сетка (revision 3501794)
|
||||
Джонатан Свіфт (revision 4047140)
|
||||
Мікалай Аляксеевіч Някрасаў (revision 4054879)
|
||||
Наша ніва (1920) (revision 3648798)
|
||||
Белсат (revision 4295169)
|
||||
Рэспубліка Сярэдняй Літвы (revision 4287459)
|
||||
Беларусь (revision 4283834)
|
||||
Віктар Дзмітрыевіч Смольскі (revision 3829868)
|
||||
Генадзь Пятровіч Пашкоў (revision 4254449)
|
||||
Знешняя палітыка Беларусі (revision 4258993)
|
||||
2012 (revision 4181555)
|
||||
Саюз пісьменнікаў СССР (revision 4039027)
|
||||
1995 (revision 3568939)
|
||||
Саюз Савецкіх Сацыялістычных Рэспублік (revision 3433404)
|
||||
Сістэма абазначэння аб’ектаў адміністрацыйна-тэрытарыяльнага падзелу (revision 2873336)
|
||||
XVIII (revision 4286695)
|
||||
Я. Шутовіч (revision 4063897)
|
||||
Заходняя Беларусь (revision 4189742)
|
||||
1958 (revision 4179116)
|
||||
Санкт-Пецярбург (revision 4297788)
|
||||
Аляксандр Паўлавіч Мацвееў (revision 4002064)
|
||||
Жодзіна (revision 4204566)
|
||||
Узда (revision 4263659)
|
||||
Анатоль Васільевіч Багатыроў (revision 4045167)
|
||||
Людміла Рублеўская (revision 4109306)
|
||||
Беларускі ПЭН-цэнтр (revision 4256051)
|
||||
Дзяржаўны літаратурны музей Янкі Купалы (revision 4258654)
|
||||
Пётр Паўлавіч Яршоў (revision 3212130)
|
||||
Выбары ў мясцовыя Саветы дэпутатаў Беларусі (1995) (revision 3419938)
|
||||
Беластоцкае ваяводства (1919—1939) (revision 4167163)
|
||||
Уладзімір Аляксеевіч Арлоў (revision 3996298)
|
||||
1948 (revision 4298274)
|
||||
1998 (revision 4169162)
|
||||
Андрэй Катлярчук (revision 4045257)
|
||||
Аляксей Камай (revision 4004900)
|
||||
БелаПАН (revision 4114047)
|
||||
Літаратура і мастацтва (1932) (revision 4226134)
|
||||
18 стагоддзе (revision 4286695)
|
||||
Якуб Колас (revision 4276306)
|
||||
6 лістапада (revision 4241889)
|
||||
Масква (revision 4293280)
|
||||
Столінскі раён (revision 4126133)
|
||||
БелТА (revision 4114101)
|
||||
Беларуская Энцыклапедыя імя Петруся Броўкі (revision 4131649)
|
||||
11 красавіка (revision 4257199)
|
||||
Беларуская мова (revision 4156511)
|
||||
Маскоўская кансерваторыя (revision 3240079)
|
||||
Ягор Аляксандравіч Марціновіч (revision 4224289)
|
||||
Фінляндыя (revision 4214425)
|
||||
Мікола Мятліцкі (revision 4283301)
|
||||
Народны артыст Беларусі (revision 4275698)
|
||||
Курган (паэма) (revision 4086218)
|
||||
Старыя Дарогі (revision 4204749)
|
||||
Слуцк (revision 4278680)
|
||||
Каралеўства Польскае, 1916—1918 (revision 4288202)
|
||||
Залаты апостраф (revision 4111782)
|
||||
Уладзімір Андрэевіч Калеснік (revision 4074048)
|
||||
Залатая літара (revision 4085127)
|
||||
Нарматыўны кантроль (revision 4228063)
|
||||
Вышэйшы Гаспадарчы суд Рэспублікі Беларусь (revision 4215415)
|
||||
Euronews (revision 4165755)
|
||||
2003 (revision 4206607)
|
||||
Л. Маракоў (revision 4053060)
|
||||
1957 (revision 4170762)
|
||||
Слуцкі раён (revision 4145373)
|
||||
Дзяржаўная прэмія БССР (revision 3316889)
|
||||
Энцыклапедыя гісторыі Беларусі (revision 4030685)
|
||||
Звязда (1917) (revision 4008703)
|
||||
2018 (revision 4289036)
|
||||
1986 (revision 3316291)
|
||||
Бухарская Народная Савецкая Рэспубліка (revision 2623266)
|
||||
1939 (revision 4148673)
|
||||
Дзеяслоў (2002) (revision 4049427)
|
||||
Руская мова (revision 4214240)
|
||||
Рафаэла Джаваньёлі (revision 2633449)
|
||||
Фёдар Анісімавіч Сурганаў (revision 4188740)
|
||||
Саюз пісьменнікаў Беларусі (2005) (revision 4262467)
|
||||
Беларуская дзяржаўная кансерваторыя (revision 4216964)
|
||||
2001 (revision 4204274)
|
||||
Ігнацы Масціцкі (revision 4002826)
|
||||
29 верасня (revision 4261890)
|
||||
Іван Андрэевіч Крылоў (revision 3874970)
|
||||
М. Шкялёнак (revision 4197856)
|
||||
1935 (revision 3316357)
|
||||
Тутэйшыя (фільм, 1993) (revision 3952769)
|
||||
Майскі пераварот (Польшча) (revision 2832232)
|
||||
Лацвянскі сельсавет (revision 3562080)
|
||||
Генадзь Пашкоў (revision 4254449)
|
||||
Сырмежскі сельсавет (revision 4077910)
|
||||
13 красавіка (revision 4201937)
|
||||
1994 (revision 4170911)
|
||||
Рабінавая ноч (revision 4262182)
|
||||
5 ліпеня (revision 4148432)
|
||||
Беларускае Палессе (revision 4277908)
|
||||
Віцебская вобласць (revision 4257032)
|
||||
Саюз вызвалення Беларусі (справа) (revision 4152192)
|
||||
Навагрудак (revision 4283306)
|
||||
Расійская дзяржава (1918—1920) (revision 4070494)
|
||||
Утварэнне Вялікага Княства Літоўскага (revision 4281842)
|
||||
Коўна (revision 4264967)
|
||||
1989 (revision 4170810)
|
||||
7 сакавіка (revision 4272372)
|
||||
Советская Белоруссия (revision 3941699)
|
||||
Саксафон (revision 4119817)
|
||||
Вікісховішча (revision 4276248)
|
||||
1915 (revision 4204295)
|
||||
Канстытуцыя Рэспублікі Беларусь (revision 4051195)
|
||||
Расійская імперыя (revision 4273900)
|
||||
1930-я (revision 3508427)
|
||||
Доктар гістарычных навук (revision 4036548)
|
||||
Віленскае ваяводства, 1926—1939 (revision 4010285)
|
||||
1767 (revision 4119132)
|
||||
Салігорск (revision 4285782)
|
||||
Горад (revision 4154288)
|
||||
Заходні Берлін (revision 4273163)
|
||||
Калійныя солі (revision 3812964)
|
||||
Паштовы індэкс (revision 2680497)
|
||||
1714 (revision 3317887)
|
||||
Джордж Харысан (revision 4129049)
|
||||
Аўстра-Венгрыя (revision 3868613)
|
||||
Антарктыка (revision 3997579)
|
||||
1956 (revision 4169991)
|
||||
Люфтвафэ (revision 3726645)
|
||||
Канстытуцыя Украіны (revision 2683533)
|
||||
Васіль Уладзіміравіч Быкаў (revision 4288405)
|
||||
1698 (revision 3448249)
|
||||
Бяларучы (revision 4294726)
|
||||
1950-я (revision 4204989)
|
||||
Выбаргскі раён (Ленінградская вобласць) (revision 3641710)
|
||||
Нацыянальная парламенцкая бібліятэка Японіі (revision 4020527)
|
||||
1934 (revision 4275604)
|
||||
XIX стагоддзе (revision 4286738)
|
||||
24 студзеня (revision 4268404)
|
||||
Вільнюскае гарадское самакіраванне (revision 3492972)
|
||||
Залаты Купідон (revision 4267601)
|
||||
Мінская вобласць (revision 4296852)
|
||||
Кантрольны нумар Бібліятэкі Кангрэса (revision 3491858)
|
||||
Пінск (revision 4286890)
|
||||
Нацыянальная бібліятэка Францыі (revision 4267432)
|
||||
10 студзеня (revision 3935845)
|
||||
Аляксандр Іванавіч Якімовіч (revision 4085685)
|
||||
Лужаснянскі дзяржаўны аграрны каледж імя Ф. А. Сурганава (revision 4171547)
|
||||
Рыта Леві-Мантальчыні (revision 4058476)
|
||||
|
||||
== End of Parsed pages ==
|
||||
|
||||
- Wikipedia parsing ended at: 2022-12-17 18:49:26.830622
|
||||
|
||||
65 characters appeared 853773 times.
|
||||
|
||||
Most Frequent characters:
|
||||
[ 0] Char а: 15.572991884259633 %
|
||||
[ 1] Char н: 6.632324985681206 %
|
||||
[ 2] Char і: 5.7941630855039925 %
|
||||
[ 3] Char р: 5.325888731548082 %
|
||||
[ 4] Char с: 5.02124100902699 %
|
||||
[ 5] Char к: 4.3536162422564315 %
|
||||
[ 6] Char ы: 4.066654719697156 %
|
||||
[ 7] Char л: 4.051428189928704 %
|
||||
[ 8] Char е: 3.6824776609239227 %
|
||||
[ 9] Char т: 3.4540797143971527 %
|
||||
[10] Char я: 3.1694607348791775 %
|
||||
[11] Char в: 3.116285007841663 %
|
||||
[12] Char д: 3.1063291999161367 %
|
||||
[13] Char о: 2.9540639022316237 %
|
||||
[14] Char у: 2.9458650015870727 %
|
||||
[15] Char м: 2.709385281567817 %
|
||||
[16] Char п: 2.6671023796723485 %
|
||||
[17] Char з: 2.22483025347487 %
|
||||
[18] Char ц: 1.998657722837335 %
|
||||
[19] Char г: 1.9463018858642753 %
|
||||
[20] Char ў: 1.9429051984543901 %
|
||||
[21] Char б: 1.610732595198021 %
|
||||
[22] Char э: 1.3249423441593962 %
|
||||
[23] Char ч: 1.3172119521231052 %
|
||||
[24] Char й: 1.1151676148109626 %
|
||||
[25] Char ь: 1.0356382785588207 %
|
||||
[26] Char х: 0.9220249410557607 %
|
||||
[27] Char ш: 0.7558215122755112 %
|
||||
[28] Char ж: 0.5403075524758923 %
|
||||
[29] Char ю: 0.4688599897162361 %
|
||||
[30] Char ф: 0.3941328666987595 %
|
||||
[31] Char i: 0.36391406146598687 %
|
||||
[32] Char e: 0.2897725742088354 %
|
||||
[33] Char ё: 0.28508748812623497 %
|
||||
|
||||
The first 34 characters have an accumulated ratio of 0.9715966656242353.
|
||||
The first 6 characters have an accumulated ratio of 0.42700225938276326.
|
||||
All characters whose order is over 26 have an accumulated ratio of 0.030978960449674565.
|
||||
|
||||
1518 sequences found.
|
||||
|
||||
First 893 (typical positive ratio): 0.9950100888151092
|
||||
Next 272 (1165-893): 0.003995003102100991
|
||||
Rest: 0.0009949080827897916
|
||||
|
||||
- Processing end: 2022-12-17 18:49:26.928946
|
||||
263
script/BuildLangModelLogs/LangBulgarianModel.log
Normal file
263
script/BuildLangModelLogs/LangBulgarianModel.log
Normal file
@ -0,0 +1,263 @@
|
||||
= Logs of language model for Bulgarian (bg) =
|
||||
|
||||
- Generated by BuildLangModel.py
|
||||
- Started: 2022-12-17 18:13:39.705509
|
||||
- Maximum depth: 4
|
||||
- Max number of pages: 200
|
||||
|
||||
== Parsed pages ==
|
||||
|
||||
Амурски_леопард (revision 11479353)
|
||||
Пектусан (revision 11051736)
|
||||
Тъкан (revision 11413541)
|
||||
Растителноядно животно (revision 9401552)
|
||||
Козмин (залив) (revision 10801896)
|
||||
Око (revision 11307426)
|
||||
Руска кухня (revision 8912349)
|
||||
Обединена система за таксономична информация (revision 10952587)
|
||||
Лисица (revision 11570875)
|
||||
Сихоте Алин (revision 10913633)
|
||||
Шриланкски леопард (revision 11478652)
|
||||
Фазан (revision 11554738)
|
||||
Северна Корея (revision 11596651)
|
||||
Протисти (revision 11599945)
|
||||
Калдера (revision 10605482)
|
||||
Месо (revision 11396435)
|
||||
Мезозойска ера (revision 11406482)
|
||||
Тамилски (revision 11536357)
|
||||
Птици (revision 11599947)
|
||||
Паразитизъм (revision 10905879)
|
||||
Череп (revision 11382448)
|
||||
Домати (revision 11568692)
|
||||
Гъби (revision 11575731)
|
||||
Връх (revision 11560584)
|
||||
Хабаровски край (revision 11326255)
|
||||
Слъзна жлеза (revision 9848117)
|
||||
Клетка (биология) (revision 11599652)
|
||||
Чанбайшан (revision 11436397)
|
||||
Усури (revision 11485897)
|
||||
Нормативен контрол (revision 11218813)
|
||||
Phasianus (revision 11554738)
|
||||
Перм (период) (revision 10376629)
|
||||
Въздух (revision 11586473)
|
||||
Растения (revision 11599967)
|
||||
Лов (revision 11549760)
|
||||
Култ към личността (revision 11309525)
|
||||
Биология (revision 11597684)
|
||||
Азиатска късоноктеста видра (revision 11530864)
|
||||
Ротатории (revision 10164408)
|
||||
Торонто (revision 11500811)
|
||||
Епител (revision 11544065)
|
||||
Животни (revision 11599450)
|
||||
Animal Diversity Web (revision 11280365)
|
||||
Главоноги (revision 11321675)
|
||||
Новозеландски морски лъв (revision 11531150)
|
||||
Общомедия (revision 11583644)
|
||||
Яйцеклетка (revision 11574210)
|
||||
Риба (revision 11602135)
|
||||
Ялуцзян (revision 11616897)
|
||||
Водорасли (revision 11589165)
|
||||
Тигрова генета (revision 11532904)
|
||||
Карбон (revision 11440434)
|
||||
Енотовидно куче (revision 11530902)
|
||||
Пинин (revision 10953442)
|
||||
Морска видра (revision 11022765)
|
||||
Коткови (revision 11296822)
|
||||
Сметана (revision 10602821)
|
||||
Просо (revision 10908234)
|
||||
Корейски полуостров (revision 11532552)
|
||||
Уикивидове (revision 9824200)
|
||||
Източна Азия (revision 10984512)
|
||||
Злато (revision 11601280)
|
||||
Лист (revision 11417909)
|
||||
Уикиданни (revision 10288984)
|
||||
Персийски леопард (revision 10731068)
|
||||
Vormela (revision 11531190)
|
||||
Африкански леопард (revision 10671790)
|
||||
Далечен изток (revision 10098481)
|
||||
Индийски леопард (revision 10949302)
|
||||
Червен списък на световнозастрашените видове (revision 10923987)
|
||||
Елда (revision 11398540)
|
||||
Латински език (revision 11610275)
|
||||
Николай Пржевалски (revision 11378214)
|
||||
Корейски език (revision 11585784)
|
||||
Цитоплазма (revision 10815311)
|
||||
Силур (revision 10913196)
|
||||
Дърво (revision 11599411)
|
||||
Амур (revision 11232524)
|
||||
Оцет (revision 10974969)
|
||||
Индийски солонгой (revision 11530605)
|
||||
Креда (revision 11194691)
|
||||
BBC News (revision 11556539)
|
||||
Ендодерма (revision 10159731)
|
||||
Система на Маккюн-Райшауер (revision 10199499)
|
||||
Вол (revision 11486361)
|
||||
Тумънцзян (revision 11405669)
|
||||
Тайга (revision 11596057)
|
||||
Паренхим (revision 9238563)
|
||||
Бикин (река) (revision 10416126)
|
||||
Национален център за биотехнологична информация на САЩ (revision 10901368)
|
||||
Кокошоподобни (revision 11377806)
|
||||
Телевизор (revision 11587645)
|
||||
Влажност (revision 11587428)
|
||||
Анатолийски леопард (revision 10986842)
|
||||
Синайски леопард (revision 10737955)
|
||||
Акомодация (revision 9073034)
|
||||
Бульон (revision 9265335)
|
||||
Мляко (revision 11599803)
|
||||
Хранителна верига (revision 9990974)
|
||||
Китайски език (revision 11315056)
|
||||
Мъжки (revision 11120791)
|
||||
Камбрий (revision 10117802)
|
||||
Зигота (revision 10544543)
|
||||
Листо (revision 11417909)
|
||||
Кромид лук (revision 10698110)
|
||||
Хрян (revision 11494398)
|
||||
Ектодерма (revision 10806725)
|
||||
Храст (revision 11500525)
|
||||
Геология (revision 11598573)
|
||||
Дългоопашат скункс (revision 11531277)
|
||||
Лигавица (revision 10894252)
|
||||
Горчица (revision 8753833)
|
||||
Подковонос на Мехели (revision 10377709)
|
||||
Бозайници (revision 11597688)
|
||||
Кванмьонсон-1 (revision 11507924)
|
||||
Азиатска палмова цивета (revision 11531312)
|
||||
Хранителни вещества (revision 11590475)
|
||||
Дмитрий Орлов (revision 10880810)
|
||||
Въглероден диоксид (revision 10769242)
|
||||
Ракообразни (revision 11349934)
|
||||
Испански език (revision 11599556)
|
||||
Уикиречник (revision 9194836)
|
||||
Уретра (revision 11600909)
|
||||
ISO 639 (revision 10477132)
|
||||
Биологична система (revision 10872761)
|
||||
Палеозой (revision 10972967)
|
||||
Розетка (revision 11250355)
|
||||
Ихтиозаври (revision 11141622)
|
||||
Хабаровск (revision 11427125)
|
||||
Хавайски тюлен монах (revision 11531012)
|
||||
Кодкод (revision 11480480)
|
||||
Южна Европа (revision 10119488)
|
||||
Вода (revision 11606762)
|
||||
URL (revision 11283400)
|
||||
Ивичест зурлест скункс (revision 11476684)
|
||||
Храносмилателна система (revision 11298271)
|
||||
Триас (revision 10657489)
|
||||
ООН (revision 11599875)
|
||||
Alexa Internet (revision 11547819)
|
||||
Псориазис (revision 11607604)
|
||||
Партеногенеза (revision 11201489)
|
||||
Картоф (revision 11611083)
|
||||
Коприва (revision 11416720)
|
||||
Воден плъх (revision 11351201)
|
||||
Прилепи (revision 11566273)
|
||||
Odobenidae (revision 11032101)
|
||||
Гондвана (revision 11074999)
|
||||
Домашна муха (revision 11484479)
|
||||
Трахея (revision 11408131)
|
||||
Безполово размножаване (revision 10972108)
|
||||
Карибски регион (revision 10503045)
|
||||
Географска координатна система (revision 10929840)
|
||||
Entoprocta (revision 10346607)
|
||||
Бадем (revision 11339812)
|
||||
Удил (revision 10422385)
|
||||
Южноафриканска морска котка (revision 11476346)
|
||||
Библиотечно дело (revision 11477309)
|
||||
Организъм (revision 11079762)
|
||||
Животно (revision 11599450)
|
||||
Донг Фанг Хонг I (revision 11537199)
|
||||
Палеоген (revision 9895031)
|
||||
Триптофан (revision 11566722)
|
||||
Боливия (revision 11584461)
|
||||
Суспензия (revision 11306702)
|
||||
Chlorophyceae (revision 11097610)
|
||||
Тетраподоморфи (revision 10796558)
|
||||
Wayback Machine (revision 11423066)
|
||||
Mustelidae (revision 10988654)
|
||||
Епителна тъкан (revision 11544065)
|
||||
Чернолапа котка (revision 11545586)
|
||||
Уралски федерален окръг (revision 11412555)
|
||||
Северна Африка (revision 11617946)
|
||||
Корейски архипелаг (revision 11436736)
|
||||
Златна палмова цивета (revision 11530618)
|
||||
Макроелемент (revision 11151625)
|
||||
Международен съюз за защита на природата (revision 11546091)
|
||||
Пролетен горицвет (revision 11560104)
|
||||
Име (revision 11387941)
|
||||
Neophoca (revision 11552636)
|
||||
Алвеола (revision 10429710)
|
||||
Лападови (revision 9926969)
|
||||
|
||||
== End of Parsed pages ==
|
||||
|
||||
- Wikipedia parsing ended at: 2022-12-17 18:16:58.793948
|
||||
|
||||
59 characters appeared 866927 times.
|
||||
|
||||
Most Frequent characters:
|
||||
[ 0] Char а: 11.195290952986813 %
|
||||
[ 1] Char и: 9.90394808328729 %
|
||||
[ 2] Char о: 8.887830232533997 %
|
||||
[ 3] Char е: 8.05834862681633 %
|
||||
[ 4] Char т: 7.773895610587743 %
|
||||
[ 5] Char н: 7.376976377480457 %
|
||||
[ 6] Char р: 5.300561638984598 %
|
||||
[ 7] Char с: 4.85496472021289 %
|
||||
[ 8] Char в: 4.23022930419747 %
|
||||
[ 9] Char л: 3.41978044287466 %
|
||||
[10] Char к: 3.3481481139703804 %
|
||||
[11] Char д: 2.8882477994110234 %
|
||||
[12] Char п: 2.700227354783044 %
|
||||
[13] Char з: 2.255207185841484 %
|
||||
[14] Char м: 2.1408953695063135 %
|
||||
[15] Char я: 1.6356625182973883 %
|
||||
[16] Char ъ: 1.4382987264210252 %
|
||||
[17] Char г: 1.3491332026802718 %
|
||||
[18] Char ч: 1.2814227726209935 %
|
||||
[19] Char у: 1.267234726799373 %
|
||||
[20] Char б: 1.132852016375081 %
|
||||
[21] Char ж: 0.7340871838113243 %
|
||||
[22] Char ц: 0.6595711057563094 %
|
||||
[23] Char х: 0.5456053393192275 %
|
||||
[24] Char й: 0.5091547500539261 %
|
||||
[25] Char a: 0.437522421149647 %
|
||||
[26] Char ф: 0.37927068830478233 %
|
||||
[27] Char щ: 0.3754641394258109 %
|
||||
[28] Char i: 0.342589399107422 %
|
||||
[29] Char e: 0.3205575555957999 %
|
||||
[30] Char o: 0.3129444578378571 %
|
||||
[31] Char ш: 0.27326406952373156 %
|
||||
[32] Char r: 0.25757647414372836 %
|
||||
[33] Char n: 0.24073537910343085 %
|
||||
[34] Char s: 0.236006030496224 %
|
||||
[35] Char t: 0.23069993205887002 %
|
||||
[36] Char c: 0.2030159402118056 %
|
||||
[37] Char l: 0.19990149112901087 %
|
||||
[38] Char m: 0.16322020193165054 %
|
||||
[39] Char u: 0.1605671527129735 %
|
||||
[40] Char ю: 0.1558378041057667 %
|
||||
[41] Char p: 0.12861521212282004 %
|
||||
[42] Char d: 0.12065606446678902 %
|
||||
[43] Char h: 0.11258156684472856 %
|
||||
[44] Char b: 0.07832262693398637 %
|
||||
[45] Char y: 0.07059417921001422 %
|
||||
[46] Char g: 0.07047882924398478 %
|
||||
[47] Char k: 0.053637734203687275 %
|
||||
[48] Char f: 0.052368884577363495 %
|
||||
[49] Char v: 0.04060318804236112 %
|
||||
[50] Char w: 0.024108142900151914 %
|
||||
[51] Char x: 0.022493243375739824 %
|
||||
[52] Char ь: 0.01799459470059186 %
|
||||
|
||||
The first 53 characters have an accumulated ratio of 0.9996920155907014.
|
||||
The first 5 characters have an accumulated ratio of 0.4581931350621217.
|
||||
All characters whose order is over 29 have an accumulated ratio of 0.03226223199877268.
|
||||
|
||||
1236 sequences found.
|
||||
|
||||
First 720 (typical positive ratio): 0.9950164618425456
|
||||
Next 201 (921-720): 0.003986830525963603
|
||||
Rest: 0.0009967076314908452
|
||||
|
||||
- Processing end: 2022-12-17 18:16:58.922580
|
||||
238
script/BuildLangModelLogs/LangCatalanModel.log
Normal file
238
script/BuildLangModelLogs/LangCatalanModel.log
Normal file
@ -0,0 +1,238 @@
|
||||
= Logs of language model for Catalan (ca) =
|
||||
|
||||
- Generated by BuildLangModel.py
|
||||
- Started: 2022-12-20 01:31:40.290803
|
||||
- Maximum depth: 4
|
||||
- Max number of pages: 200
|
||||
|
||||
== Parsed pages ==
|
||||
|
||||
Parlament_Europeu (revision 31056370)
|
||||
Genji Monogatari (revision 31007904)
|
||||
Bundestag (revision 30742728)
|
||||
Kana (revision 29176811)
|
||||
Jun'ichirō Tanizaki (revision 30750244)
|
||||
Representació proporcional amb llista de partit (revision 22086795)
|
||||
Agències de la Unió Europea (revision 30276199)
|
||||
Poder executiu (revision 30290834)
|
||||
Edicions Atalanta (revision 26048077)
|
||||
Animació (revision 30865051)
|
||||
Pressupost de la Unió Europea (revision 30231577)
|
||||
Jorge Luis Borges (revision 30783720)
|
||||
Universitat de Pittsburgh (revision 25411555)
|
||||
Satiricó (revision 31019009)
|
||||
Dramatis personae (revision 30858787)
|
||||
Corpus lingüístic (revision 28600087)
|
||||
Genji Monogatari Emaki (revision 30520718)
|
||||
Era Keichō (revision 27881416)
|
||||
Període Heian (revision 30351338)
|
||||
Uji (revision 26298733)
|
||||
Clan Minamoto (revision 29218047)
|
||||
Ventafocs (revision 30167478)
|
||||
わ (revision 28487155)
|
||||
Japó (revision 30980338)
|
||||
Agència Europea dels Sistemes Globals de Navegació per Satèl·lit (revision 28777516)
|
||||
Període Shōwa (revision 30351346)
|
||||
ム (revision 25190709)
|
||||
Premi Balzan (revision 30321993)
|
||||
Germans Grimm (revision 30104486)
|
||||
Europol (revision 25369380)
|
||||
Unió Europea (revision 30730061)
|
||||
Kyoto (revision 30706119)
|
||||
Incendi del Reichstag (revision 30894126)
|
||||
Processament de llenguatge natural (revision 29016655)
|
||||
794 (revision 29283769)
|
||||
CANTIC (revision 30488826)
|
||||
Casa de la Història Europea (revision 30703943)
|
||||
VP:VER (revision 30232565)
|
||||
Katakana (revision 29937701)
|
||||
Shogunat Kamakura (revision 28808156)
|
||||
Eleccions (revision 30449311)
|
||||
Noam Chomsky (revision 30552025)
|
||||
Eleccions federals alemanyes de 1994 (revision 28337358)
|
||||
Conceptes d'unitat europea abans del 1945 (revision 30927921)
|
||||
Era Heian (revision 30351338)
|
||||
Gemeinsame Normdatei (revision 30883432)
|
||||
La Bella Dorment (pel·lícula de 1959) (revision 30982067)
|
||||
Població (revision 30352350)
|
||||
Obra literària (revision 31011396)
|
||||
も (revision 25190714)
|
||||
Istituto Centrale per il Catalogo Unico (revision 28786509)
|
||||
Política (revision 31014511)
|
||||
ハ (revision 31071577)
|
||||
Vot (revision 27865452)
|
||||
Clan Taira (revision 26323649)
|
||||
Permís de conducció europeu (revision 27672810)
|
||||
Mandala (revision 30940608)
|
||||
Campània antiga (revision 29855854)
|
||||
ゐ (revision 28487156)
|
||||
Consell de la Unió Europea (revision 30308594)
|
||||
24 de juliol (revision 31063555)
|
||||
Kyōto (revision 30706119)
|
||||
Alfons X de Castella (revision 30535714)
|
||||
VIAF (revision 28927187)
|
||||
1975 (revision 31057077)
|
||||
モ (revision 25190714)
|
||||
Sistema Galileo (revision 30880731)
|
||||
Densitat de població (revision 30174278)
|
||||
Autodesk Maya (revision 30989692)
|
||||
Nàpols (revision 31028649)
|
||||
Memòria de traducció (revision 30341759)
|
||||
Ryukyu (revision 29922259)
|
||||
Agència Europea per a la Seguretat i la Salut en el Treball (revision 29049313)
|
||||
ISNI (revision 30824306)
|
||||
PDF (revision 29442049)
|
||||
Eleccions federals alemanyes de 1972 (revision 30271501)
|
||||
Sistema presidencialista (revision 30596011)
|
||||
Primer ministre (revision 27174693)
|
||||
Coeducació (revision 31048027)
|
||||
Ko Tazawa (revision 30932179)
|
||||
Poliomielitis (revision 30976061)
|
||||
18 de setembre (revision 31063494)
|
||||
Campanya electoral (revision 27935270)
|
||||
Kōbō Abe (revision 30016508)
|
||||
Rodopis (revision 28014188)
|
||||
Política Agrària Comunitària (revision 30353551)
|
||||
21 d'octubre (revision 30980460)
|
||||
1984 (revision 31063521)
|
||||
South Park (revision 31024165)
|
||||
Hiragana (revision 29920075)
|
||||
Associació de Votants de Schleswig Meridional (revision 30753058)
|
||||
ひ (revision 31071564)
|
||||
Lingüística (revision 31037031)
|
||||
Blauet comú (revision 28729161)
|
||||
Autodeterminació (revision 29349294)
|
||||
Xina (revision 31007838)
|
||||
Control d'autoritats (revision 29854505)
|
||||
Guillermo de Torre (revision 30765552)
|
||||
Unesco (revision 30129516)
|
||||
Romanització Hepburn (revision 29144432)
|
||||
Tanka (revision 30478859)
|
||||
Clientelisme (revision 30811663)
|
||||
Corpus Textual Informatitzat de la Llengua Catalana (revision 29876775)
|
||||
Secessió (revision 29980781)
|
||||
Fada protectora (revision 29175001)
|
||||
を (revision 28487157)
|
||||
Ōtsu (revision 30010938)
|
||||
Gran Enciclopèdia Catalana (revision 30724375)
|
||||
LCCN (revision 30638965)
|
||||
Universitat privada (revision 28518823)
|
||||
Robert Louis Stevenson (revision 30728093)
|
||||
Kioto (revision 30706119)
|
||||
7 de setembre (revision 30503878)
|
||||
Aardman Animations (revision 30216975)
|
||||
Llibertinatge (revision 29597307)
|
||||
Bibliothèque nationale de France (revision 30715383)
|
||||
Alemanya Occidental (revision 30239917)
|
||||
National Library of Australia (revision 30977078)
|
||||
Diccionari Descriptiu de la Llengua Catalana (revision 27017217)
|
||||
1969 (revision 31060188)
|
||||
Separació de poders (revision 30362225)
|
||||
Isaac Titsingh (revision 29748956)
|
||||
Adolf Hitler (revision 30951478)
|
||||
Període Kamakura (revision 28808156)
|
||||
Societas Europaea (revision 28857120)
|
||||
Invasions japoneses a Corea (revision 30978745)
|
||||
Agència de la Unió Europea (revision 30276199)
|
||||
Sistema polític (revision 30713673)
|
||||
1606 (revision 26237152)
|
||||
Universitat Rovira i Virgili (revision 30865280)
|
||||
IVA (revision 30328630)
|
||||
Patricis (revision 30923152)
|
||||
Els barrufets (revision 31008031)
|
||||
Lapislàtzuli Editorial (revision 30176117)
|
||||
Internet (revision 30894405)
|
||||
BIBSYS (revision 30255267)
|
||||
Agència Europea de Seguretat Marítima (revision 28888118)
|
||||
National Diet Library (revision 30669422)
|
||||
Grup Enciclopèdia Catalana (revision 31077222)
|
||||
Competència comunicativa (revision 30307632)
|
||||
Castell Fushimi (revision 30610308)
|
||||
Walter Gropius (revision 30790098)
|
||||
Biblioteca Nacional de España (revision 31071591)
|
||||
Diccionari Normatiu Valencià (revision 29882403)
|
||||
Oscar Wilde (revision 31078983)
|
||||
Hampshire (revision 30823098)
|
||||
Clan Fujiwara (revision 30894950)
|
||||
Speedy Gonzales (revision 30151280)
|
||||
Tlön, Uqbar, Orbis Tertius (revision 29688246)
|
||||
Japó ocupat (revision 28083159)
|
||||
Garbancito de la Mancha (revision 30219073)
|
||||
SUDOC (revision 29231585)
|
||||
Gerardo Diego (revision 29912471)
|
||||
Universitat (revision 29907980)
|
||||
Foliscopi (revision 29903436)
|
||||
1980 (revision 31063457)
|
||||
Infart de miocardi (revision 30894255)
|
||||
Encyclopædia Britannica (revision 28347959)
|
||||
Petroni (revision 29790499)
|
||||
Horari de màxima audiència (revision 27872454)
|
||||
Sutra (revision 23458427)
|
||||
Medicina (revision 31002196)
|
||||
ホ (revision 25190705)
|
||||
Luci Appuleu (revision 30336717)
|
||||
Novel·la (revision 30386814)
|
||||
Kimba, el lleó blanc (revision 30273901)
|
||||
UTC+09:00 (revision 25182859)
|
||||
Arquitectura neogòtica (revision 30347122)
|
||||
Segle I (revision 30953541)
|
||||
Emperador del Japó (revision 27799841)
|
||||
Biblioteca Nacional de la República Txeca (revision 29847950)
|
||||
Gran Diccionari de la Llengua Catalana (revision 29063719)
|
||||
Període Reiwa (revision 29227861)
|
||||
|
||||
== End of Parsed pages ==
|
||||
|
||||
- Wikipedia parsing ended at: 2022-12-20 01:34:38.734771
|
||||
|
||||
57 characters appeared 1339831 times.
|
||||
|
||||
Most Frequent characters:
|
||||
[ 0] Char e: 12.524042211293812 %
|
||||
[ 1] Char a: 11.715955221218199 %
|
||||
[ 2] Char i: 7.815090112111155 %
|
||||
[ 3] Char s: 7.809940208877089 %
|
||||
[ 4] Char r: 6.866686917976969 %
|
||||
[ 5] Char n: 6.706069646097157 %
|
||||
[ 6] Char l: 6.58105387918327 %
|
||||
[ 7] Char t: 6.268850325152949 %
|
||||
[ 8] Char o: 5.046308079153267 %
|
||||
[ 9] Char c: 4.242027539294135 %
|
||||
[10] Char d: 4.013192708632656 %
|
||||
[11] Char u: 3.5825413802188484 %
|
||||
[12] Char m: 3.048966623402504 %
|
||||
[13] Char p: 2.778783294310999 %
|
||||
[14] Char g: 1.4824257686230575 %
|
||||
[15] Char v: 1.3498717375549603 %
|
||||
[16] Char b: 1.2941184373253045 %
|
||||
[17] Char f: 0.975943980994618 %
|
||||
[18] Char q: 0.7455417884792933 %
|
||||
[19] Char h: 0.6949383914837021 %
|
||||
[20] Char ó: 0.5910446914573555 %
|
||||
[21] Char x: 0.5195431364104875 %
|
||||
[22] Char é: 0.4443097674258918 %
|
||||
[23] Char à: 0.3875115592936721 %
|
||||
[24] Char j: 0.36474749427353154 %
|
||||
[25] Char y: 0.3636279500922131 %
|
||||
[26] Char è: 0.3583287743006394 %
|
||||
[27] Char í: 0.3250409939761059 %
|
||||
[28] Char k: 0.2481656268589098 %
|
||||
[29] Char ò: 0.21577348187943107 %
|
||||
[30] Char z: 0.17778361599336034 %
|
||||
[31] Char w: 0.11673113997213082 %
|
||||
[32] Char ç: 0.11016314744172959 %
|
||||
[33] Char ú: 0.08792153637287091 %
|
||||
[34] Char ü: 0.06709801460034885 %
|
||||
[35] Char ï: 0.05448448349082832 %
|
||||
|
||||
The first 36 characters have an accumulated ratio of 0.9997462366522347.
|
||||
The first 5 characters have an accumulated ratio of 0.4673171467147723.
|
||||
All characters whose order is over 21 have an accumulated ratio of 0.03321687585971664.
|
||||
|
||||
1083 sequences found.
|
||||
|
||||
First 517 (typical positive ratio): 0.9950067888087288
|
||||
Next 195 (712-517): 0.003994192320077694
|
||||
Rest: 0.0009990188711934689
|
||||
|
||||
- Processing end: 2022-12-20 01:34:38.859159
|
||||
246
script/BuildLangModelLogs/LangCroatianModel.log
Normal file
246
script/BuildLangModelLogs/LangCroatianModel.log
Normal file
@ -0,0 +1,246 @@
|
||||
= Logs of language model for Croatian (hr) =
|
||||
|
||||
- Generated by BuildLangModel.py
|
||||
- Started: 2022-12-14 21:45:13.277205
|
||||
- Maximum depth: 4
|
||||
- Max number of pages: 200
|
||||
|
||||
== Parsed pages ==
|
||||
|
||||
Fizika čvrstog stanja (revision 6149421)
|
||||
Sila (revision 6392237)
|
||||
Valna duljina (revision 6188667)
|
||||
Polimer (revision 6162634)
|
||||
Mehanika (revision 6411078)
|
||||
Električni luk (revision 6411138)
|
||||
Kristalna rešetka (revision 6284090)
|
||||
Walter Houser Brattain (revision 6192985)
|
||||
Optika (revision 6212697)
|
||||
Kelvin (revision 6129568)
|
||||
Vodik (revision 6458324)
|
||||
Tvari (revision 6463183)
|
||||
Metan (revision 6411094)
|
||||
Dalekozor (revision 6153750)
|
||||
Pretvorba mjernih jedinica za temperaturu (revision 6164706)
|
||||
Mol (mjerna jedinica) (revision 6314382)
|
||||
Kilogram (revision 6386491)
|
||||
Opruga (revision 6411096)
|
||||
Luminiscencija (revision 6284106)
|
||||
Klor (revision 6235817)
|
||||
Kemijski element (revision 6313785)
|
||||
Drugi svjetski rat (revision 6546145)
|
||||
Celzij (revision 6351426)
|
||||
Kvark (revision 6232090)
|
||||
Silicij (revision 6407176)
|
||||
Centar masa (revision 6464428)
|
||||
Žarulja (revision 6334342)
|
||||
Informacija (revision 6382761)
|
||||
Richard Edward Taylor (revision 6462674)
|
||||
Livermorij (revision 6284954)
|
||||
Staklenički plinovi (revision 6411122)
|
||||
Ivan Supek (revision 6243478)
|
||||
Plastika (revision 6342661)
|
||||
Akustika (revision 6411111)
|
||||
Kolotura (revision 6383631)
|
||||
Vodikova veza (revision 6313784)
|
||||
Supravodljivost (revision 6349445)
|
||||
Biofizika (revision 6424045)
|
||||
Transformator (revision 6362760)
|
||||
Sila teža (revision 6174663)
|
||||
Elektrana (revision 6497146)
|
||||
Wolfgang Ketterle (revision 6193969)
|
||||
Teorija relativnosti (revision 6393386)
|
||||
Spreg sila (revision 6411171)
|
||||
Niz ugljik-dušik-kisik (revision 6203722)
|
||||
Kut (revision 6419168)
|
||||
Gravitacija (revision 6250724)
|
||||
Grijanje (revision 6510672)
|
||||
Boja (revision 6383771)
|
||||
Pieter Zeeman (revision 6391308)
|
||||
Daniel Bernoulli (revision 6427669)
|
||||
Willis Eugene Lamb (revision 6411089)
|
||||
Izvijanje (revision 6242753)
|
||||
PVC (revision 6446678)
|
||||
Monomeri (revision 5834634)
|
||||
Kisik (revision 6538984)
|
||||
Mjerna jedinica (revision 6221493)
|
||||
Markantun de Dominis (revision 6547370)
|
||||
13. listopada (revision 6348637)
|
||||
Bizmut (revision 6271464)
|
||||
Mjesec (revision 6532224)
|
||||
Nuklearni reaktor (revision 6203912)
|
||||
Leonhard Euler (revision 6531483)
|
||||
Kaučuk (revision 6401198)
|
||||
Nevill Francis Mott (revision 6216085)
|
||||
Infracrveno zračenje (revision 6403265)
|
||||
Metanski klatrat (revision 6411376)
|
||||
Titranje (revision 6446466)
|
||||
Neutron (revision 6456710)
|
||||
Jod (revision 6328974)
|
||||
Ljubljanski drveni kotač (revision 6496295)
|
||||
Rad (fizika) (revision 6400248)
|
||||
Paladij (revision 6158260)
|
||||
Ugljik (revision 6486622)
|
||||
Natrij (revision 6301527)
|
||||
Količina gibanja (revision 6235220)
|
||||
Fizika elementarnih čestica (revision 6424044)
|
||||
Električna energija (revision 6461505)
|
||||
Čavao (revision 6528601)
|
||||
Anthony James Leggett (revision 6277017)
|
||||
Stoka (revision 6362743)
|
||||
Frekvencija (revision 6493481)
|
||||
Metar (revision 6223720)
|
||||
SI osnovne jedinice (revision 6358142)
|
||||
Newtonova ljestvica (revision 6356055)
|
||||
Ravnoteža (mehanika) (revision 6168644)
|
||||
Kratki spoj (revision 6233794)
|
||||
Materijalizam (revision 6225072)
|
||||
Planckova temperatura (revision 6129559)
|
||||
Tiskana pločica (revision 6183823)
|
||||
Standardna atmosfera (revision 5907918)
|
||||
Sabatierova reakcija (revision 4997599)
|
||||
Aluminij (revision 6377870)
|
||||
Snaga (revision 6416113)
|
||||
Wolfgang Pauli (revision 6421065)
|
||||
Rezultanta (revision 6469772)
|
||||
Električni vodič (revision 6258708)
|
||||
Celzijeva temperaturna ljestvica (revision 6351426)
|
||||
Michel Mayor (revision 6223128)
|
||||
Prijenos topline (revision 6479591)
|
||||
Rankineova ljestvica (revision 6355289)
|
||||
Newtonovi zakoni gibanja (revision 6506020)
|
||||
Jevgraf Stepanovič Fjodorov (revision 6411391)
|
||||
Kemijska skupina (revision 5144226)
|
||||
Brzina svjetlosti (revision 6513071)
|
||||
Antički Rim (revision 6503914)
|
||||
Molekula (revision 6221038)
|
||||
Val (revision 6403857)
|
||||
Duncan Haldane (revision 6259892)
|
||||
Elementarna čestica (revision 6258638)
|
||||
Elastomer (revision 6258788)
|
||||
Elektron (revision 6470677)
|
||||
Materijal (revision 6318671)
|
||||
Kuhinjska sol (revision 6152740)
|
||||
Oganeson (revision 6213574)
|
||||
Pokus (revision 6162558)
|
||||
Robert Woodrow Wilson (revision 6170406)
|
||||
Ubrzanje (revision 6463804)
|
||||
Ljudi (revision 6500701)
|
||||
Planet (revision 6432483)
|
||||
Temperatura (revision 6541246)
|
||||
Svijest (revision 6403170)
|
||||
Raketa (revision 6469758)
|
||||
Fluor (revision 6358745)
|
||||
Slaba nuklearna sila (revision 6175551)
|
||||
Lepton (revision 6525041)
|
||||
Mehanika fluida (revision 6155246)
|
||||
Higgsov bozon (revision 6464426)
|
||||
Galij (revision 6253347)
|
||||
Čandrasekara Venkata Raman (revision 6198967)
|
||||
Tvrdoća po Rockwellu (revision 6318988)
|
||||
Krvni tlak (revision 5549040)
|
||||
Donji kvark (revision 6261061)
|
||||
Prijelazni metali (revision 5424792)
|
||||
Newtonov zakon gravitacije (revision 6391956)
|
||||
Sheldon Lee Glashow (revision 6174061)
|
||||
Henry Cavendish (revision 6248484)
|
||||
Majtnerij (revision 6227452)
|
||||
Rømerova ljestvica (revision 6355902)
|
||||
W (revision 5642521)
|
||||
Bakar (element) (revision 6519638)
|
||||
Takaaki Kajita (revision 6181477)
|
||||
Atom (revision 6527497)
|
||||
Tava (revision 6181948)
|
||||
Halogeni elementi (revision 5676937)
|
||||
Refrakcija (revision 6169023)
|
||||
Avogadrov broj (revision 6274688)
|
||||
Civilizacija (revision 5753049)
|
||||
Ivar Giaever (revision 6243230)
|
||||
Vrijeme (fizika) (revision 6411075)
|
||||
James Peebles (revision 6462671)
|
||||
HNK Hajduk Split (revision 6542426)
|
||||
Latinski jezik (revision 6537991)
|
||||
Latinski (revision 6537991)
|
||||
Helij (revision 6328120)
|
||||
Pojačala snage (revision 5445726)
|
||||
Otto Stern (revision 6211600)
|
||||
Stočarstvo (revision 6343293)
|
||||
Aleksej Aleksejevič Abrikosov (revision 6279401)
|
||||
Vektor (revision 6189866)
|
||||
Voda (revision 6546156)
|
||||
Jednadžba (revision 6419050)
|
||||
Zapadnjačka filozofija (revision 6195212)
|
||||
Barij (revision 6326495)
|
||||
Međunarodna unija za čistu i primijenjenu kemiju (revision 6223441)
|
||||
Gustoća (revision 6359052)
|
||||
Vodna turbina (revision 6209527)
|
||||
SI (revision 6370483)
|
||||
Volumen (revision 6514423)
|
||||
Zvuk (revision 6463880)
|
||||
Termoakumulacijska peć (revision 6182467)
|
||||
Orbitale (revision 6212291)
|
||||
Moskovij (revision 6340094)
|
||||
Elektroslaba teorija (revision 6258647)
|
||||
Toplina isparavanja (revision 6400287)
|
||||
Talij (revision 6181570)
|
||||
Kvantna mehanika (revision 6411085)
|
||||
Americij (revision 6325584)
|
||||
Rendgenske zrake (revision 6544752)
|
||||
Njemački jezik (revision 6539116)
|
||||
Fizikalne konstante (revision 6255560)
|
||||
Bura (revision 6529413)
|
||||
Farad (revision 4779053)
|
||||
Heksan (revision 6248694)
|
||||
William John Macquorn Rankine (revision 6427777)
|
||||
Izmjenična električna struja (revision 6504217)
|
||||
|
||||
== End of Parsed pages ==
|
||||
|
||||
- Wikipedia parsing ended at: 2022-12-14 21:57:58.987423
|
||||
|
||||
54 characters appeared 1243479 times.
|
||||
|
||||
Most Frequent characters:
|
||||
[ 0] Char a: 10.914699805947668 %
|
||||
[ 1] Char i: 9.602333453158437 %
|
||||
[ 2] Char e: 9.441574807455535 %
|
||||
[ 3] Char o: 8.609876001122657 %
|
||||
[ 4] Char n: 6.6901813380041 %
|
||||
[ 5] Char r: 5.5092205015122895 %
|
||||
[ 6] Char j: 5.200650754857943 %
|
||||
[ 7] Char t: 5.182475940486329 %
|
||||
[ 8] Char s: 4.7983118331712875 %
|
||||
[ 9] Char u: 4.0200115964966034 %
|
||||
[10] Char k: 3.856518686684697 %
|
||||
[11] Char l: 3.7823718776111215 %
|
||||
[12] Char m: 3.522616787255756 %
|
||||
[13] Char v: 3.2415505207566837 %
|
||||
[14] Char d: 2.9616101277142595 %
|
||||
[15] Char p: 2.7908794599667544 %
|
||||
[16] Char z: 1.9033694979971514 %
|
||||
[17] Char g: 1.5358522339339868 %
|
||||
[18] Char b: 1.2457789797817254 %
|
||||
[19] Char c: 1.1129259118971853 %
|
||||
[20] Char č: 1.0259119776047685 %
|
||||
[21] Char h: 0.7720275131304991 %
|
||||
[22] Char š: 0.4921675396206932 %
|
||||
[23] Char f: 0.45927595078002925 %
|
||||
[24] Char ć: 0.41295429999219935 %
|
||||
[25] Char ž: 0.4000871747733577 %
|
||||
[26] Char đ: 0.1987166650984858 %
|
||||
[27] Char y: 0.18303485623802251 %
|
||||
[28] Char w: 0.06996499337745149 %
|
||||
[29] Char x: 0.034097881829930385 %
|
||||
[30] Char q: 0.01431467680596134 %
|
||||
|
||||
The first 31 characters have an accumulated ratio of 0.9998536364506355.
|
||||
The first 5 characters have an accumulated ratio of 0.4525866540568839.
|
||||
All characters whose order is over 20 have an accumulated ratio of 0.0303664155164663.
|
||||
|
||||
814 sequences found.
|
||||
|
||||
First 402 (typical positive ratio): 0.9950352500193897
|
||||
Next 120 (522-402): 0.003975137961536701
|
||||
Rest: 0.0009896120190735491
|
||||
|
||||
- Processing end: 2022-12-14 21:57:59.324528
|
||||
238
script/BuildLangModelLogs/LangCzechModel.log
Normal file
238
script/BuildLangModelLogs/LangCzechModel.log
Normal file
@ -0,0 +1,238 @@
|
||||
= Logs of language model for Czech (cs) =
|
||||
|
||||
- Generated by BuildLangModel.py
|
||||
- Started: 2022-12-14 20:21:08.161564
|
||||
- Maximum depth: 4
|
||||
- Max number of pages: 200
|
||||
|
||||
== Parsed pages ==
|
||||
|
||||
Sociální fobie (revision 22020472)
|
||||
Alkoholismus (revision 22196482)
|
||||
Obsedantně kompulzivní porucha (revision 22207124)
|
||||
Beta-blokátor (revision 21048224)
|
||||
Tréma (revision 22031372)
|
||||
Národní parlamentní knihovna Japonska (revision 21018639)
|
||||
Alprazolam (revision 21750289)
|
||||
Schizofrenie (revision 21709687)
|
||||
Predestinace (revision 22029350)
|
||||
Šógunát Tokugawa (revision 22213068)
|
||||
Serotonin (revision 21724759)
|
||||
Národní knihovna Španělska (revision 21819713)
|
||||
Národnost (revision 22181364)
|
||||
Mytí rukou (revision 21428406)
|
||||
Národní knihovna České republiky (revision 22177708)
|
||||
Kalvinismus (revision 21693188)
|
||||
Delirium tremens (revision 21223330)
|
||||
Library of Congress Control Number (revision 19355161)
|
||||
Příčně pruhovaná svalovina (revision 20442240)
|
||||
Japonština (revision 22023125)
|
||||
Virtual International Authority File (revision 21184542)
|
||||
Jan Kalvín (revision 21942173)
|
||||
Zinkový prst (revision 21309616)
|
||||
Vražda (revision 21956156)
|
||||
Pivo (revision 22181432)
|
||||
Univerzitní systém dokumentace (revision 21061529)
|
||||
Starořečtina (revision 22071115)
|
||||
Tulip (revision 21245729)
|
||||
Hraniční porucha osobnosti (revision 21373755)
|
||||
Polská národní knihovna (revision 21273191)
|
||||
Autorita (knihovnictví) (revision 21919206)
|
||||
Tony Shalhoub (revision 22120821)
|
||||
Lotyšská národní knihovna (revision 21809399)
|
||||
Národní knihovna Izraele (revision 20491004)
|
||||
Mezinárodní klasifikace nemocí (revision 21565517)
|
||||
Latina (revision 21868129)
|
||||
Kyselina gama-aminomáselná (revision 19923822)
|
||||
Gemeinsame Normdatei (revision 20842200)
|
||||
International Standard Book Number (revision 21443136)
|
||||
Játra (revision 21732676)
|
||||
Vatikánská apoštolská knihovna (revision 20529679)
|
||||
Hladká svalovina (revision 21837789)
|
||||
Psychoterapie (revision 21818892)
|
||||
Církev (revision 21907627)
|
||||
Tokio (revision 21751763)
|
||||
Hypothalamus (revision 21171603)
|
||||
Tachykardie (revision 20455710)
|
||||
Japonský parlament (revision 19000717)
|
||||
Psychika (revision 21886751)
|
||||
James W. Black (revision 22073892)
|
||||
WorldCat (revision 21510754)
|
||||
Bezvědomí (revision 22201907)
|
||||
Čeština (revision 21883072)
|
||||
Emoce (revision 21879452)
|
||||
Library of Congress (revision 20205897)
|
||||
Sympatikus (revision 20346074)
|
||||
Kognitivně behaviorální terapie (revision 21301071)
|
||||
Náboženství (revision 21691676)
|
||||
Katecholamin (revision 20335142)
|
||||
Francouzská národní knihovna (revision 20503017)
|
||||
Poruchy příjmu potravy (revision 21792957)
|
||||
Polytematický strukturovaný heslář (revision 20359962)
|
||||
Křesťanství (revision 22184649)
|
||||
Lékař (revision 21531442)
|
||||
Kvašení (revision 21159147)
|
||||
Lék (revision 22208230)
|
||||
Očekávaná délka života (revision 22203408)
|
||||
Svatý Augustin (revision 22029348)
|
||||
Kouření (revision 22186960)
|
||||
Národní a univerzitní knihovna v Záhřebu (revision 21291658)
|
||||
Midazolam (revision 20745654)
|
||||
Tkáň (revision 21701830)
|
||||
Středověk (revision 22180362)
|
||||
Barvoslepost (revision 21425886)
|
||||
Strach (revision 20549071)
|
||||
Indikace (lékařství) (revision 20833751)
|
||||
Histidin (revision 21686432)
|
||||
Václav Šubert (revision 21971945)
|
||||
Francouzština (revision 22166917)
|
||||
Noradrenalin (revision 21238417)
|
||||
1545 (revision 22181825)
|
||||
1648 (revision 21425329)
|
||||
Hydroxyl (revision 20606297)
|
||||
Řecká národní knihovna (revision 21292640)
|
||||
Curych (revision 22173470)
|
||||
Sčítání lidu 2011 (revision 21412557)
|
||||
Muzikoterapie (revision 21861522)
|
||||
Uhersko (revision 22059638)
|
||||
1619 (revision 20732281)
|
||||
Národní knihovna Chile (revision 22189619)
|
||||
Biodostupnost (revision 20289543)
|
||||
Prostata (revision 21774332)
|
||||
Rakovina (revision 21909777)
|
||||
Erving Goffman (revision 22055829)
|
||||
Adrenalin (revision 21886991)
|
||||
Beta-skládaný list (revision 20282664)
|
||||
Vazivo (revision 22209213)
|
||||
Benzo(a)pyren (revision 21798637)
|
||||
Přízvuk (revision 20671123)
|
||||
Český jazykový atlas (revision 18945974)
|
||||
Plusquamperfektum (revision 20879161)
|
||||
Psychiatrie (revision 20502746)
|
||||
Psychologie (revision 21840396)
|
||||
Alveolární laterální aproximanta (revision 20285060)
|
||||
Opožděná bolest svalů (revision 21738585)
|
||||
Pověst (revision 21934490)
|
||||
Infúze (revision 21275511)
|
||||
Behaviorální psychoterapie (revision 11059677)
|
||||
Deponentní sloveso (revision 20295633)
|
||||
28. srpen (revision 21626735)
|
||||
Fakulta (revision 19416104)
|
||||
Panoráma (revision 21210018)
|
||||
Zákaz kouření (revision 20956994)
|
||||
Sval krejčovský (revision 20367825)
|
||||
Varolův most (revision 20352546)
|
||||
Česká tisková kancelář (revision 21721395)
|
||||
Michael S. Brown (revision 22090607)
|
||||
1800 (revision 20878320)
|
||||
Mozek (revision 22201687)
|
||||
Lékařská fakulta Univerzity Palackého (revision 21371716)
|
||||
Tyrosin (revision 20778060)
|
||||
Opilost (revision 21376325)
|
||||
Glotální souhláska (revision 20310894)
|
||||
Neurony (revision 21791635)
|
||||
Duševní porucha (revision 21595942)
|
||||
Lidské oko (revision 21948364)
|
||||
Peníze (revision 21410625)
|
||||
Spolek (revision 21018723)
|
||||
1834 (revision 22194449)
|
||||
Elektronická cigareta (revision 21411137)
|
||||
Svatý Ambrož (revision 21924230)
|
||||
Státní ústav pro kontrolu léčiv (revision 20915059)
|
||||
Bibliografie (revision 21639559)
|
||||
České stavovské povstání (revision 21921182)
|
||||
Che Guevara (revision 22065779)
|
||||
Varixy (revision 20668715)
|
||||
Výjimečný trest (revision 20574730)
|
||||
1618 (revision 20732127)
|
||||
Česko (revision 22170403)
|
||||
2001 (revision 21769254)
|
||||
Bílkovina (revision 21864645)
|
||||
Centrální nervová soustava (revision 18275861)
|
||||
Spojené státy americké (revision 22179989)
|
||||
Judaismus (revision 22206394)
|
||||
Halucinace (revision 21860029)
|
||||
Martin Heidegger (revision 22038120)
|
||||
Léčivý přípravek (revision 20846808)
|
||||
Ethnologue (revision 20305961)
|
||||
Kůže (revision 21989451)
|
||||
Mezenchym (revision 21046685)
|
||||
Obec křesťanů (revision 21682273)
|
||||
1898 (revision 21630487)
|
||||
Osteoporóza (revision 20589463)
|
||||
Mezinárodní standardní číslo audiovizuálního díla (revision 22211957)
|
||||
Česká terminologická databáze knihovnictví a informační vědy (revision 22188254)
|
||||
Chromozom (revision 21601734)
|
||||
Simple Knowledge Organization System (revision 20365379)
|
||||
MARC (revision 15943911)
|
||||
Fénické písmo (revision 20690142)
|
||||
Adenosinmonofosfát (revision 21921144)
|
||||
Oxytocin (revision 20194250)
|
||||
Izrael (revision 21909756)
|
||||
Nizozemská královská knihovna (revision 21819389)
|
||||
Washington, D.C. (revision 22030685)
|
||||
Evropská unie (revision 22219884)
|
||||
Harry Stack Sullivan (revision 22011659)
|
||||
Vazal (revision 22189033)
|
||||
Diktatura (revision 21684808)
|
||||
|
||||
== End of Parsed pages ==
|
||||
|
||||
- Wikipedia parsing ended at: 2022-12-14 20:34:20.075639
|
||||
|
||||
72 characters appeared 1422585 times.
|
||||
|
||||
Most Frequent characters:
|
||||
[ 0] Char o: 8.036567234998259 %
|
||||
[ 1] Char e: 7.506967949191084 %
|
||||
[ 2] Char a: 6.896037846596161 %
|
||||
[ 3] Char n: 6.845496051202564 %
|
||||
[ 4] Char t: 5.167424090651877 %
|
||||
[ 5] Char i: 5.077236158120605 %
|
||||
[ 6] Char s: 4.775391277146884 %
|
||||
[ 7] Char v: 4.340267892603957 %
|
||||
[ 8] Char r: 4.24143372803734 %
|
||||
[ 9] Char k: 4.069001149316209 %
|
||||
[10] Char l: 4.035330050576943 %
|
||||
[11] Char u: 3.1477908174204003 %
|
||||
[12] Char p: 3.1040675952579284 %
|
||||
[13] Char d: 3.044106327565664 %
|
||||
[14] Char í: 3.042841025316589 %
|
||||
[15] Char m: 2.978029432336205 %
|
||||
[16] Char c: 2.8064403884477906 %
|
||||
[17] Char h: 2.3411606336352486 %
|
||||
[18] Char z: 2.1364628475627114 %
|
||||
[19] Char á: 2.125004832751646 %
|
||||
[20] Char j: 1.8273073313721147 %
|
||||
[21] Char y: 1.7880126670814047 %
|
||||
[22] Char b: 1.5920314076136046 %
|
||||
[23] Char ě: 1.2533521722779308 %
|
||||
[24] Char é: 1.2440732891180493 %
|
||||
[25] Char ř: 1.0416249292660895 %
|
||||
[26] Char č: 0.9885525293743431 %
|
||||
[27] Char ý: 0.9577635079801909 %
|
||||
[28] Char ž: 0.792008913351399 %
|
||||
[29] Char š: 0.6496624103304899 %
|
||||
[30] Char g: 0.561723904019795 %
|
||||
[31] Char ů: 0.5182115655655023 %
|
||||
[32] Char f: 0.49466288481883336 %
|
||||
[33] Char ú: 0.15443716895651224 %
|
||||
[34] Char x: 0.1142286752636925 %
|
||||
[35] Char w: 0.11120600877979171 %
|
||||
[36] Char ň: 0.06663925178460338 %
|
||||
[37] Char ť: 0.044004400440044 %
|
||||
[38] Char ó: 0.039786726276461515 %
|
||||
[39] Char ď: 0.012442138782568354 %
|
||||
[40] Char q: 0.011317425672279688 %
|
||||
|
||||
The first 41 characters have an accumulated ratio of 0.999801066368618.
|
||||
The first 7 characters have an accumulated ratio of 0.4430512060790744.
|
||||
All characters whose order is over 27 have an accumulated ratio of 0.035703314740419725.
|
||||
|
||||
1372 sequences found.
|
||||
|
||||
First 749 (typical positive ratio): 0.995023909981167
|
||||
Next 205 (954-749): 0.003983584242635896
|
||||
Rest: 0.000992505776197139
|
||||
|
||||
- Processing end: 2022-12-14 20:34:20.424560
|
||||
232
script/BuildLangModelLogs/LangDanishModel.log
Normal file
232
script/BuildLangModelLogs/LangDanishModel.log
Normal file
@ -0,0 +1,232 @@
|
||||
= Logs of language model for Danish (da) =
|
||||
|
||||
- Generated by BuildLangModel.py
|
||||
- Started: 2022-12-14 21:48:43.045295
|
||||
- Maximum depth: 4
|
||||
- Max number of pages: 200
|
||||
|
||||
== Parsed pages ==
|
||||
|
||||
Forside (revision 10000691)
|
||||
Middelaldercentret (revision 11351327)
|
||||
Mauna Loa (revision 11344555)
|
||||
Jimi Hendrix (revision 11344444)
|
||||
9. december (revision 10927383)
|
||||
Mette Frederiksen (revision 11354825)
|
||||
Rigsretssag (revision 11093257)
|
||||
Brandts (revision 11336216)
|
||||
Kattegat (revision 10764555)
|
||||
Sydpolsekspeditioner (revision 11090364)
|
||||
Casper & Mandrilaftalen (revision 11221713)
|
||||
Nordrhein-Westfalen (revision 10972283)
|
||||
Is (revision 11259689)
|
||||
Bjarne Laustsen (revision 10778277)
|
||||
Frejlev (Guldborgsund Kommune) (revision 10663830)
|
||||
Musikgenre (revision 11209814)
|
||||
Faaborg Museum (revision 10593221)
|
||||
Klassicisme (revision 11202332)
|
||||
Forfatningsdomstolen i Den Russiske Føderation (revision 11276313)
|
||||
Gemeinsame Normdatei (revision 11281765)
|
||||
1993 (revision 11303939)
|
||||
Isbjerg (revision 11318087)
|
||||
Mink (revision 11348096)
|
||||
Sverige (revision 11328486)
|
||||
Internationalt Standardbognummer (revision 11037702)
|
||||
Regeringen Jens Otto Krag III (revision 10416573)
|
||||
Julius Cæsar (revision 11313394)
|
||||
Moers (revision 10160258)
|
||||
Henrik Sass Larsen (revision 11338905)
|
||||
Esben Pretzmann (revision 11280112)
|
||||
2010 (revision 11200088)
|
||||
Wayback Machine (revision 11280977)
|
||||
Dumaen (revision 11205571)
|
||||
Fernando Collor de Mello (revision 10254545)
|
||||
Carl Petersen (polarfarer) (revision 11322726)
|
||||
Odense Bys Museer (revision 11277380)
|
||||
Euroman (revision 11093296)
|
||||
Olaf Rude (revision 10261097)
|
||||
Hallands Väderö (revision 9381458)
|
||||
Mecklenburg-Vorpommern (revision 11048800)
|
||||
Regatta (revision 8395677)
|
||||
Pixar Animation Studios (revision 11343980)
|
||||
Lasse Rimmer (revision 11332179)
|
||||
Erik Ninn-Hansen (revision 10931231)
|
||||
Skjoldvulkan (revision 10870812)
|
||||
Kreis Paderborn (revision 10780457)
|
||||
Durban (revision 10163074)
|
||||
Den tyske antarktisekspedition 1911–13 (revision 11221987)
|
||||
Andrew Loog Oldham (revision 11020595)
|
||||
Guitar (revision 11074541)
|
||||
Kirk Douglas (revision 10972253)
|
||||
Sne (revision 11195384)
|
||||
Basalt (revision 11257623)
|
||||
Kunstmuseum (revision 10915075)
|
||||
Brakvand (revision 11059940)
|
||||
Derek and the Dominos (revision 7842204)
|
||||
Widukind (revision 10464146)
|
||||
Kuldioxid (revision 11307051)
|
||||
Passatvind (revision 10395443)
|
||||
Gudenå (revision 11239800)
|
||||
Brandts Klædefabrik (revision 11236311)
|
||||
Højesteret (revision 11317827)
|
||||
Konspirationsteori (revision 11319148)
|
||||
Somalia (revision 11325370)
|
||||
London (revision 11350614)
|
||||
Repræsentanternes hus (revision 11181607)
|
||||
Medievidenskab (revision 10707008)
|
||||
Bibliothèque nationale de France (revision 11055813)
|
||||
Kunstindeks Danmark (revision 9867315)
|
||||
2013 (revision 11214281)
|
||||
Dortmund (revision 11222064)
|
||||
Scotia-ekspeditionen (revision 11324525)
|
||||
Universe (revision 11211772)
|
||||
Krystal (revision 11352535)
|
||||
Frank Hvam (revision 11231772)
|
||||
Karsten Lauritzen (revision 11338762)
|
||||
Frederikshavn (revision 11247372)
|
||||
Sørine Gotfredsen (revision 11200560)
|
||||
28. december (revision 6878014)
|
||||
Det Danske Filminstitut (revision 10997618)
|
||||
Jens Juel (revision 10931652)
|
||||
Samfundsvidenskab (revision 11241111)
|
||||
TheTVDB (revision 10969052)
|
||||
1869 (revision 11181496)
|
||||
Nansens Fram-ekspedition (revision 11321339)
|
||||
Dogme 95 (revision 10973606)
|
||||
Tjekkisk nationalbibliotek (revision 9639333)
|
||||
Jordskred (revision 10982742)
|
||||
Socialistisk Folkeparti (revision 11334364)
|
||||
Horsens (revision 11349176)
|
||||
Laurits Tuxen (revision 11094258)
|
||||
Berlingske Tidende (revision 11330268)
|
||||
Roskilde (revision 11323983)
|
||||
Annette Lind (revision 11295631)
|
||||
Storhertugdømmet Oldenborg (revision 10921923)
|
||||
1842 (revision 11166436)
|
||||
Odense (revision 11335838)
|
||||
1950 (revision 11340086)
|
||||
Stortinget (revision 11187972)
|
||||
Kramnitse (revision 11228337)
|
||||
Danmarks Borgcenter (revision 11160197)
|
||||
Autoritetsdata (revision 11213971)
|
||||
Discogs (revision 10585542)
|
||||
Bundesverfassungsgericht (revision 10762218)
|
||||
Frances Reid (revision 9482242)
|
||||
Uffe Elbæk (revision 11330100)
|
||||
Skälderviken (revision 11076540)
|
||||
Camilla Fabricius (revision 11224056)
|
||||
Andrew Johnson (revision 11280464)
|
||||
Ryder Hesjedal (revision 11210377)
|
||||
Syrerock (revision 10305968)
|
||||
Tunø (revision 11283297)
|
||||
Oberyn Martell (revision 10993021)
|
||||
Is (fødevare) (revision 11252696)
|
||||
Nationale parlamentsbibliotek (revision 10001351)
|
||||
Mandag (revision 11210797)
|
||||
Julie Skovsby (revision 11338912)
|
||||
Håris (revision 11062058)
|
||||
Århusbugten (revision 11294988)
|
||||
Historiker (revision 10949929)
|
||||
Drivhusgas (revision 10974686)
|
||||
Domstol (revision 11265163)
|
||||
Christlich Demokratische Union Deutschlands (revision 11252453)
|
||||
Isslag (revision 11037455)
|
||||
Podcast (revision 11079901)
|
||||
Horslunde (revision 11304807)
|
||||
Jordens atmosfære (revision 11318446)
|
||||
Sejerø (revision 11254317)
|
||||
Harald Giersing (revision 9122370)
|
||||
Google (revision 11233546)
|
||||
Norge (revision 11332499)
|
||||
Endurance-ekspeditionen (revision 11314743)
|
||||
Nørre Alslev (revision 11055825)
|
||||
Anton van Dyck (revision 11047560)
|
||||
1941 (revision 11340092)
|
||||
Europaparlamentsvalget 2004 (revision 10592050)
|
||||
Kommunisme (revision 11319073)
|
||||
William H. Taft (revision 10976270)
|
||||
Fredag (revision 10597397)
|
||||
Støjrock (revision 11325939)
|
||||
Den Store Danske Encyklopædi (revision 11301417)
|
||||
Storbritannien (revision 11329834)
|
||||
1479 (revision 8418183)
|
||||
BIBSYS (revision 11277126)
|
||||
England (revision 11285614)
|
||||
Live fra Bremen (revision 11071090)
|
||||
Registreret partnerskab (revision 10998951)
|
||||
Forfatter (revision 11232460)
|
||||
Blakshøj jættestue (revision 10884380)
|
||||
Realkreditinstitut (revision 10701454)
|
||||
William Tecumseh Sherman (revision 11324708)
|
||||
Rasmus Horn Langhoff (revision 11056736)
|
||||
Sofie Carsten Nielsen (revision 11339465)
|
||||
Figurmaleri (revision 10912483)
|
||||
Museet for Fotokunst (revision 11141794)
|
||||
Wiktionary (revision 8998237)
|
||||
Virtual International Authority File (revision 8702589)
|
||||
Sakskøbing Å (revision 11210208)
|
||||
Jorden (revision 11353567)
|
||||
UNFCCC (revision 11058172)
|
||||
Tyskland (revision 11327013)
|
||||
Danmarks Miljøundersøgelser (revision 11233391)
|
||||
AllMovie (revision 11199258)
|
||||
Jadebusen (revision 9736139)
|
||||
Taktslag (revision 10482631)
|
||||
Tao Geoghegan Hart (revision 11232300)
|
||||
1860'erne (revision 8151963)
|
||||
Franklin Pierce (revision 10976258)
|
||||
Periodikum (revision 10226798)
|
||||
USA (revision 11353896)
|
||||
Carl Anton Larsen (revision 10979268)
|
||||
16. december (revision 9709998)
|
||||
|
||||
== End of Parsed pages ==
|
||||
|
||||
- Wikipedia parsing ended at: 2022-12-14 21:51:50.797330
|
||||
|
||||
55 characters appeared 1257348 times.
|
||||
|
||||
Most Frequent characters:
|
||||
[ 0] Char e: 15.172728632009594 %
|
||||
[ 1] Char r: 8.643112328488215 %
|
||||
[ 2] Char n: 7.764198932992299 %
|
||||
[ 3] Char t: 6.993608770205226 %
|
||||
[ 4] Char s: 6.410953848894657 %
|
||||
[ 5] Char a: 6.373494052561422 %
|
||||
[ 6] Char i: 6.270817625669266 %
|
||||
[ 7] Char d: 6.0302318848878755 %
|
||||
[ 8] Char o: 5.058345024607348 %
|
||||
[ 9] Char l: 4.955907195144065 %
|
||||
[10] Char g: 3.8945462990357482 %
|
||||
[11] Char m: 3.2335518885781824 %
|
||||
[12] Char k: 3.1113900049946395 %
|
||||
[13] Char f: 2.6197202365613976 %
|
||||
[14] Char v: 2.2084578016587293 %
|
||||
[15] Char u: 1.8723535568514047 %
|
||||
[16] Char b: 1.755520349179384 %
|
||||
[17] Char h: 1.5780833945733401 %
|
||||
[18] Char p: 1.4999825028552158 %
|
||||
[19] Char ø: 0.8079704266440159 %
|
||||
[20] Char å: 0.7898370220495837 %
|
||||
[21] Char y: 0.7276426255897332 %
|
||||
[22] Char æ: 0.7048963373704018 %
|
||||
[23] Char c: 0.6372142000464469 %
|
||||
[24] Char j: 0.5855180904570573 %
|
||||
[25] Char w: 0.1428403274192984 %
|
||||
[26] Char x: 0.04954873272952278 %
|
||||
[27] Char z: 0.04636743367786802 %
|
||||
[28] Char é: 0.015588365353108287 %
|
||||
[29] Char ö: 0.011293611633374372 %
|
||||
[30] Char q: 0.011055014204500266 %
|
||||
|
||||
The first 31 characters have an accumulated ratio of 0.9997677651692288.
|
||||
The first 5 characters have an accumulated ratio of 0.44984602512589994.
|
||||
All characters whose order is over 19 have an accumulated ratio of 0.037218017605308955.
|
||||
|
||||
925 sequences found.
|
||||
|
||||
First 485 (typical positive ratio): 0.9950259492446578
|
||||
Next 131 (616-485): 0.003975741174389147
|
||||
Rest: 0.0009983095809530385
|
||||
|
||||
- Processing end: 2022-12-14 21:51:50.880914
|
||||
251
script/BuildLangModelLogs/LangEnglishModel.log
Normal file
251
script/BuildLangModelLogs/LangEnglishModel.log
Normal file
@ -0,0 +1,251 @@
|
||||
= Logs of language model for English (en) =
|
||||
|
||||
- Generated by BuildLangModel.py
|
||||
- Started: 2022-12-14 20:20:53.218193
|
||||
- Maximum depth: 4
|
||||
- Max number of pages: 200
|
||||
|
||||
== Parsed pages ==
|
||||
|
||||
Marmot (revision 1116705550)
|
||||
Barcode of Life Data System (revision 1090221883)
|
||||
Palmer's chipmunk (revision 1121473732)
|
||||
Jacopo Ligozzi (revision 1104222073)
|
||||
Olympic Peninsula (revision 1123430023)
|
||||
INaturalist (revision 1122751314)
|
||||
Mammal Species of the World (revision 1127351948)
|
||||
Berry (revision 1112801626)
|
||||
Rock squirrel (revision 1121470993)
|
||||
Natural reservoir (revision 1110806364)
|
||||
Onomatopoeia (revision 1120663626)
|
||||
Mohave ground squirrel (revision 1121470764)
|
||||
Townsend's chipmunk (revision 1121473824)
|
||||
Madrid (revision 1126851882)
|
||||
Otospermophilus (revision 1093268410)
|
||||
Plant hormone (revision 1116921032)
|
||||
Cuckoo (revision 1126465747)
|
||||
Daurian ground squirrel (revision 1121469422)
|
||||
Elwha River (revision 1121691243)
|
||||
All rights reserved (revision 1125321157)
|
||||
Long-tailed ground squirrel (revision 1121468895)
|
||||
CDFG (revision 1122725741)
|
||||
Don Martin (cartoonist) (revision 1116900902)
|
||||
Palindromic (revision 1121604941)
|
||||
EMBnet (revision 1018817077)
|
||||
Ferdinando II de' Medici, Grand Duke of Tuscany (revision 1125579637)
|
||||
Cloister (revision 1120569425)
|
||||
Asymptomatic (revision 1111685734)
|
||||
Grand Duke (revision 1126227666)
|
||||
Eucalyptus oil (revision 1123039166)
|
||||
Seattle (revision 1127044692)
|
||||
Xerospermophilus (revision 1095542738)
|
||||
Red-cheeked ground squirrel (revision 1121469468)
|
||||
Roy Crane (revision 1073477180)
|
||||
Round-tailed ground squirrel (revision 1121470819)
|
||||
Asia Minor ground squirrel (revision 1121357197)
|
||||
Ictidomys parvidens (revision 1121470382)
|
||||
Hopi chipmunk (revision 1121297258)
|
||||
Anecdata.org (revision 1099498174)
|
||||
Himalayan marmot (revision 1113552191)
|
||||
Storage organ (revision 1087238870)
|
||||
Phage therapy (revision 1115823876)
|
||||
Pacific County, Washington (revision 1115141058)
|
||||
Agostino Carracci (revision 1118965396)
|
||||
Share-alike (revision 1124025423)
|
||||
Fragaria chiloensis (revision 1117621684)
|
||||
Pacific Northwest (revision 1125120564)
|
||||
Eastern chipmunk (revision 1120765340)
|
||||
Yakima County, Washington (revision 1117226237)
|
||||
United States congressional delegations from Washington (revision 1113282930)
|
||||
Hygiene (revision 1121837793)
|
||||
Synonym (taxonomy) (revision 1115465643)
|
||||
Washington's congressional districts (revision 1126665844)
|
||||
Culling (revision 1124588069)
|
||||
Citizen scientists (revision 1126971493)
|
||||
Accademia delle Arti del Disegno (revision 1117591379)
|
||||
Lacey, Washington (revision 1118158829)
|
||||
Berberis thunbergii (revision 1098470800)
|
||||
James Joyce (revision 1127091935)
|
||||
Interim Register of Marine and Nonmarine Genera (revision 1093112130)
|
||||
Marker-assisted selection (revision 1101841526)
|
||||
Blood-borne disease (revision 1104089084)
|
||||
Research in Computational Molecular Biology (revision 1098389228)
|
||||
Eggplant (revision 1127383368)
|
||||
Purr (revision 1125642484)
|
||||
Blastomycosis (revision 1125999120)
|
||||
NatureServe (revision 1122446327)
|
||||
Xerinae (revision 1093432948)
|
||||
Baja California rock squirrel (revision 1121471079)
|
||||
Lodgepole chipmunk (revision 1121296771)
|
||||
Honey (revision 1127398567)
|
||||
Bouba/kiki effect (revision 1127022127)
|
||||
Ferdinando I de' Medici, Grand Duke of Tuscany (revision 1125114864)
|
||||
Medici (revision 1123423946)
|
||||
Bristlecone pine (revision 1108725770)
|
||||
Morphology (biology) (revision 1126240066)
|
||||
Albanian language (revision 1127442244)
|
||||
Taurus ground squirrel (revision 1121469893)
|
||||
World Environment Day (revision 1119598477)
|
||||
The New York Times (revision 1127291077)
|
||||
Rat Genome Database (revision 1121949622)
|
||||
Geobotanical prospecting (revision 992549326)
|
||||
Pre-exposure prophylaxis (revision 1121706582)
|
||||
Least chipmunk (revision 1120765536)
|
||||
EcoHealth Alliance (revision 1124297887)
|
||||
InterPro (revision 1123732177)
|
||||
Gunnison's prairie dog (revision 1121472300)
|
||||
EMBOSS (revision 1108898594)
|
||||
Black-capped marmot (revision 1121471697)
|
||||
Speckled ground squirrel (revision 1121469813)
|
||||
National Gallery of Art (revision 1124058120)
|
||||
Ground squirrel (revision 1106618817)
|
||||
Texas antelope squirrel (revision 1121470154)
|
||||
Skamania County, Washington (revision 1115141102)
|
||||
Zebrafish Information Network (revision 1084187264)
|
||||
Merriam's chipmunk (revision 1121301344)
|
||||
Stamen (revision 1107327988)
|
||||
Plant stem (revision 1125685714)
|
||||
Uinta chipmunk (revision 1121367930)
|
||||
Public Lab (revision 1123308321)
|
||||
Sierra Madre ground squirrel (revision 1121471267)
|
||||
Scripps Research (revision 1120793534)
|
||||
Morbillivirus (revision 1123109002)
|
||||
Conservation status (revision 1126423906)
|
||||
Korean language (revision 1127097954)
|
||||
Flatiron Institute (revision 1114126605)
|
||||
Espíritu Santo antelope squirrel (revision 1121470113)
|
||||
Pietre dure (revision 1124553077)
|
||||
List of biological databases (revision 1116920095)
|
||||
Needle sharing (revision 1066293994)
|
||||
ISCB Africa ASBCB Conference on Bioinformatics (revision 1003545343)
|
||||
Northern Idaho ground squirrel (revision 1123076448)
|
||||
Animal track (revision 1112366053)
|
||||
HMMER (revision 1090926305)
|
||||
RERO (identifier) (revision 1068185782)
|
||||
Catalogue of Life (revision 1118132647)
|
||||
Francesco I de' Medici, Grand Duke of Tuscany (revision 1123286810)
|
||||
Whip-poor-will (revision 1120975767)
|
||||
Doi (identifier) (revision 1127429235)
|
||||
Wildlife Conservation Society (revision 1125787985)
|
||||
Panamint chipmunk (revision 1121299808)
|
||||
Bioblitz (revision 1113263878)
|
||||
Habitat loss (revision 1117935852)
|
||||
Sciuromorpha (revision 1107286064)
|
||||
Yellow-bellied marmot (revision 1121472145)
|
||||
Allen's chipmunk (revision 1121299548)
|
||||
Hood Canal (revision 1124856006)
|
||||
Computer vision (revision 1126383414)
|
||||
Vibrio cholerae (revision 1123125512)
|
||||
Phulwara oil (revision 1039287034)
|
||||
Neah Bay, Washington (revision 1117347476)
|
||||
Chelan County, Washington (revision 1115437018)
|
||||
Columbia River (revision 1121152264)
|
||||
Philippine Genome Center (revision 1086509191)
|
||||
Thirteen-lined ground squirrel (revision 1127159966)
|
||||
Cat massage (revision 1120597363)
|
||||
Swiss French (revision 1126844735)
|
||||
Probabilistic risk analysis (revision 1118087495)
|
||||
Kingdom (biology) (revision 1126766133)
|
||||
Norfloxacin (revision 1126442196)
|
||||
Tropical ground squirrel (revision 1121471157)
|
||||
Cannabis culture (revision 1123260879)
|
||||
Fontarrón (revision 962928722)
|
||||
Heuristic algorithm (revision 1124780994)
|
||||
Spotted ground squirrel (revision 1122239672)
|
||||
Hand washing (revision 1126772691)
|
||||
Human skin (revision 1125889832)
|
||||
Slovenia (revision 1127365628)
|
||||
Australia Bioinformatics Resource (revision 1023592097)
|
||||
Utah prairie dog (revision 1125084849)
|
||||
Research center (revision 1122565049)
|
||||
Australian Wildlife Conservancy (revision 1126004200)
|
||||
Catholicism (revision 1126878543)
|
||||
White-tailed prairie dog (revision 1121472368)
|
||||
Rabbit (revision 1125928365)
|
||||
Cathedral (revision 1117971650)
|
||||
Columbia Plateau (revision 1111592488)
|
||||
Pablo de Olavide University (revision 1100528254)
|
||||
Plant habit (revision 1101707375)
|
||||
Anti-fascism (revision 1126769811)
|
||||
Coral-billed ground-cuckoo (revision 1119603104)
|
||||
Alpine marmot (revision 1121471662)
|
||||
Homozygous (revision 1125746174)
|
||||
COVID-19 vaccination in the Republic of Ireland (revision 1125658338)
|
||||
Music of North Korea (revision 1109275365)
|
||||
Eastern Washington (revision 1111432324)
|
||||
Tarbagan marmot (revision 1121488248)
|
||||
VIAF (identifier) (revision 1122669300)
|
||||
Duke of Florence (revision 1010655117)
|
||||
Accademia della Crusca (revision 1118884925)
|
||||
Mobile robot (revision 1125548051)
|
||||
Hyperlocal (revision 1116240164)
|
||||
Oregon Trail (revision 1124389602)
|
||||
Cane rat (revision 1089272788)
|
||||
Federal Way, Washington (revision 1122923555)
|
||||
Rubens (revision 1121190866)
|
||||
Pala d'Oro (revision 1072202795)
|
||||
Archduke Rainer of Austria (1895–1930) (revision 1081133439)
|
||||
Bioinformatics (revision 1125974897)
|
||||
Renal tubular acidosis (revision 1105330876)
|
||||
Brain morphometry (revision 1053832132)
|
||||
Ethnologue (revision 1127241433)
|
||||
OregonLive.com (revision 1114379550)
|
||||
Yangban (revision 1121415587)
|
||||
Belize Inlet (revision 982557553)
|
||||
Canebrake Ecological Reserve (revision 1121247294)
|
||||
Glycogen (revision 1110998630)
|
||||
Richardson's ground squirrel (revision 1122297225)
|
||||
Cluster analysis (revision 1116924542)
|
||||
Genomics (revision 1126520756)
|
||||
Spermophilus brevicauda (revision 1010428942)
|
||||
Endosperm (revision 1112721337)
|
||||
Relational database (revision 1116718100)
|
||||
Snow (revision 1126528822)
|
||||
Roadless area conservation (revision 1103267389)
|
||||
Minneapolis–Saint Paul (revision 1124710168)
|
||||
|
||||
== End of Parsed pages ==
|
||||
|
||||
- Wikipedia parsing ended at: 2022-12-14 20:24:17.046830
|
||||
|
||||
59 characters appeared 2235074 times.
|
||||
|
||||
Most Frequent characters:
|
||||
[ 0] Char e: 11.901753588471792 %
|
||||
[ 1] Char a: 8.660205433914046 %
|
||||
[ 2] Char t: 8.534616750944265 %
|
||||
[ 3] Char i: 7.941079355761822 %
|
||||
[ 4] Char n: 7.5567520359504865 %
|
||||
[ 5] Char o: 7.4230651871034254 %
|
||||
[ 6] Char s: 6.903216627279455 %
|
||||
[ 7] Char r: 6.589625220462499 %
|
||||
[ 8] Char l: 4.254847937920624 %
|
||||
[ 9] Char h: 4.180219536355396 %
|
||||
[10] Char c: 3.813967680712137 %
|
||||
[11] Char d: 3.744797711395685 %
|
||||
[12] Char u: 2.734361367901018 %
|
||||
[13] Char m: 2.5771853638850435 %
|
||||
[14] Char p: 2.266099466952772 %
|
||||
[15] Char f: 2.170576902599198 %
|
||||
[16] Char g: 1.9969361193410151 %
|
||||
[17] Char b: 1.540888579080603 %
|
||||
[18] Char y: 1.515833480233764 %
|
||||
[19] Char w: 1.324385680295149 %
|
||||
[20] Char v: 1.0713739231900152 %
|
||||
[21] Char k: 0.5591761167639192 %
|
||||
[22] Char x: 0.22384046344774267 %
|
||||
[23] Char j: 0.18035197044930057 %
|
||||
[24] Char z: 0.16464779242208535 %
|
||||
[25] Char q: 0.12464911676302441 %
|
||||
|
||||
The first 26 characters have an accumulated ratio of 0.9995445340959629.
|
||||
The first 5 characters have an accumulated ratio of 0.4459440716504241.
|
||||
All characters whose order is over 18 have an accumulated ratio of 0.036484250633312364.
|
||||
|
||||
972 sequences found.
|
||||
|
||||
First 373 (typical positive ratio): 0.9950190506759622
|
||||
Next 160 (533-373): 0.003986976910237083
|
||||
Rest: 0.0009939724138007255
|
||||
|
||||
- Processing end: 2022-12-14 20:24:17.102402
|
||||
249
script/BuildLangModelLogs/LangEsperantoModel.log
Normal file
249
script/BuildLangModelLogs/LangEsperantoModel.log
Normal file
@ -0,0 +1,249 @@
|
||||
= Logs of language model for Esperanto (eo) =
|
||||
|
||||
- Generated by BuildLangModel.py
|
||||
- Started: 2022-12-14 23:50:58.547656
|
||||
- Maximum depth: 4
|
||||
- Max number of pages: 200
|
||||
|
||||
== Parsed pages ==
|
||||
|
||||
Vikipedio:Ĉefpaĝo (revision 7731360)
|
||||
Vikipedio (revision 7803852)
|
||||
1920-aj jaroj (revision 7836874)
|
||||
Slovaka Vikipedio (revision 6973132)
|
||||
Ĉinio (revision 7795007)
|
||||
Ekstremdekstro (revision 7547871)
|
||||
Heinrich XIII. Prinz Reuß (revision 7830303)
|
||||
Esperanto kaj Libera Scio (revision 7359235)
|
||||
Kino (revision 7770288)
|
||||
Aktorino (revision 7281724)
|
||||
Kosmoteleskopo James Webb (revision 7757099)
|
||||
Grandduklando Flandrensis (revision 7775141)
|
||||
Puĉo (revision 7290643)
|
||||
Komunisma Partio de Ĉinio (revision 7461976)
|
||||
Het Belang van Limburg (revision 4997347)
|
||||
Vikipedio en Esperanto (revision 7713198)
|
||||
Norveglingva (nynorsk) Vikipedio (revision 7325739)
|
||||
Jerry Muelver (revision 7249604)
|
||||
Forkosma Komunikada Centro de Goldstone (revision 7577784)
|
||||
Germana Regno (revision 7699806)
|
||||
Libera verko (revision 7586831)
|
||||
Portugallingva Vikipedio (revision 6932114)
|
||||
Reuß jüngerer Linie (revision 7830307)
|
||||
Pingveno (revision 7794992)
|
||||
2-a jarmilo (revision 7120518)
|
||||
Reĝisoro (revision 7661754)
|
||||
Elektra ŝargo (revision 7387538)
|
||||
Katolikismo (revision 6871587)
|
||||
Ernst Toller (revision 7359493)
|
||||
Nacia Biblioteko de la Ĉeĥa Respubliko (revision 7647079)
|
||||
Pupisto (revision 7327145)
|
||||
Sankt-Peterburgo (revision 7750523)
|
||||
Filmanonco (revision 7174791)
|
||||
Sofoklo (revision 7340998)
|
||||
Vicki Baum (revision 7597010)
|
||||
Televido (revision 7349603)
|
||||
Kosma observatorio (revision 6737616)
|
||||
Kuvajto (revision 7792727)
|
||||
Germanlingva Vikipedio (revision 7774975)
|
||||
Slobodan Miloŝeviĉ (revision 6579621)
|
||||
Orienta Timoro (revision 7695303)
|
||||
Entreprenisto (revision 7306596)
|
||||
Heliumo (revision 7456668)
|
||||
Asteroido (revision 7812722)
|
||||
Dolĉa akvo (revision 7491168)
|
||||
Suno (revision 7747486)
|
||||
Aŭdvida (revision 6466227)
|
||||
Turklingva Vikipedio (revision 6846032)
|
||||
Fotilo (revision 7489706)
|
||||
Esperanto (revision 7830834)
|
||||
Iluzio (revision 7786746)
|
||||
Statisto (revision 6941563)
|
||||
Kinematiko (revision 7131316)
|
||||
Mustafa Kemal Atatürk (revision 7644360)
|
||||
Konstitucia monarkio (revision 6055847)
|
||||
Bulgarlingva Vikipedio (revision 6735806)
|
||||
Philippe Séguin (revision 5881191)
|
||||
Avtandil Abuladze (revision 7075837)
|
||||
Esĥilo (revision 7320387)
|
||||
4-a de aprilo (revision 7527941)
|
||||
Trinkaĵo (revision 7708948)
|
||||
Vikinovaĵoj (revision 7754938)
|
||||
Urdulingva Vikipedio (revision 6196040)
|
||||
Maurice Ravel (revision 6885514)
|
||||
Jarcento (revision 7039755)
|
||||
Huajbej (revision 6345507)
|
||||
Kantonlingva Vikipedio (revision 7317513)
|
||||
Kinarto de Alĝerio (revision 6569472)
|
||||
Tatarlingva Vikipedio (revision 7316068)
|
||||
Ondolongo (revision 7262547)
|
||||
Moĉio (revision 6923813)
|
||||
Germano (revision 7722099)
|
||||
Slovaklingva Vikipedio (revision 6973132)
|
||||
2013 (revision 7802302)
|
||||
Bundestag (revision 7751502)
|
||||
Novembra Revolucio (Germanio) (revision 7608301)
|
||||
Jet Propulsion Laboratory (revision 7010867)
|
||||
Zvolen (revision 7153439)
|
||||
Jimmy Wales (revision 7720893)
|
||||
Siĉŭano (revision 7217276)
|
||||
Subtropiko (revision 7321658)
|
||||
Politiko (revision 7779971)
|
||||
Fakterminaro de Kino (revision 7287089)
|
||||
Arnold Schönberg (revision 7742973)
|
||||
Nobelaro (revision 7261563)
|
||||
Figuranto (revision 6941563)
|
||||
Musko (revision 7770056)
|
||||
Alfred Döblin (revision 7437098)
|
||||
Dublado (revision 7737429)
|
||||
Josif Stalin (revision 7836369)
|
||||
Slovakio (revision 7790988)
|
||||
Akademio de la Kinaj Artoj kaj Sciencoj (revision 7764611)
|
||||
2008 (revision 7534573)
|
||||
Kanbero (revision 7791558)
|
||||
Germana lingvo (revision 7774973)
|
||||
Kosmoteleskopo Herschel (revision 7444275)
|
||||
Heinrich Mann (revision 7390862)
|
||||
Kopio (revision 7252771)
|
||||
Hispanlingva Vikipedio (revision 7462754)
|
||||
Nord-Ĉinia Altebenaĵo (revision 7733540)
|
||||
Eŭroposkeptikismo (revision 7231971)
|
||||
Esperanta Vikipedio (revision 7713198)
|
||||
TTT (revision 7833858)
|
||||
Terorismo (revision 7377262)
|
||||
Flago de Belgio (revision 7363575)
|
||||
Imitulo (revision 7124671)
|
||||
Stuntado (revision 7542167)
|
||||
Larry Sanger (revision 7626559)
|
||||
Norda Irlando (revision 7564590)
|
||||
Usono (revision 7790633)
|
||||
Westarctica (revision 7390483)
|
||||
Viliam Búr (revision 7225353)
|
||||
Nederlando (revision 7792134)
|
||||
Ĉeĥa lingvo (revision 7278803)
|
||||
7-a de decembro (revision 7830767)
|
||||
Frekvenco (revision 7790333)
|
||||
Sovetunio (revision 7794329)
|
||||
Wayback Machine (revision 7758172)
|
||||
Leono (revision 7769297)
|
||||
Bolŝevismo (revision 7752865)
|
||||
Baden-Virtembergo (revision 7744185)
|
||||
Sindikato (revision 7492693)
|
||||
Meta-Vikio (revision 7754942)
|
||||
Angla lingvo (revision 7562773)
|
||||
Cebulingva Vikipedio (revision 7353232)
|
||||
Socia movado (revision 7538684)
|
||||
Mao Zedong (revision 7760287)
|
||||
Indonezio (revision 7773463)
|
||||
Provincoj de Ĉinio (revision 6832498)
|
||||
Vjetnamlingva Vikipedio (revision 6932115)
|
||||
Finnlingva Vikipedio (revision 5571951)
|
||||
4chan (revision 7121572)
|
||||
Antikva Ateno (revision 7219870)
|
||||
Facebook (revision 7653928)
|
||||
Michal Matúšov (revision 7462125)
|
||||
2091 (revision 6758032)
|
||||
Verkisto (revision 7755843)
|
||||
Universala Kongreso de Esperanto (revision 7761888)
|
||||
Jamendo (revision 7426245)
|
||||
19-a jarcento (revision 7811275)
|
||||
Norma lingvo (revision 7218839)
|
||||
Slovianski (revision 7598109)
|
||||
Kartografio (revision 7710475)
|
||||
Nederlandlingva Vikipedio (revision 6926616)
|
||||
2002 (revision 7534586)
|
||||
Individuo (revision 7597443)
|
||||
Idlingva Vikipedio (revision 6068998)
|
||||
1915 (revision 7803173)
|
||||
Jozefo la 2-a (revision 7250267)
|
||||
Torfejo (revision 6683962)
|
||||
MediaVikio (revision 7754927)
|
||||
22-a jarcento (revision 6730856)
|
||||
2007 (revision 7534575)
|
||||
Karl Benz (revision 6805860)
|
||||
Vikivortaro (revision 7754939)
|
||||
Nova Plena Ilustrita Vortaro de Esperanto (revision 7786706)
|
||||
Kontraŭpsikiatrio (revision 7538682)
|
||||
Vjetnama milito (revision 7738668)
|
||||
Archive.today (revision 7776700)
|
||||
Konstitucio (revision 7765516)
|
||||
Franca respublika kalendaro (revision 7243862)
|
||||
Datumprilaboro (revision 7577042)
|
||||
Ĉeĥoslovakio (revision 7794537)
|
||||
ETA (revision 7339003)
|
||||
Anglalingva Vikipedio (revision 5893724)
|
||||
Julio de 2013 (revision 6721942)
|
||||
Remorko (revision 7420645)
|
||||
VIAF (revision 6620334)
|
||||
Al-Kaida (revision 7704028)
|
||||
Lingvo (revision 7830520)
|
||||
Germanaj kolonioj (revision 7762083)
|
||||
1960 (revision 7724191)
|
||||
RepRap (revision 5112367)
|
||||
Mohandas Karamchand Gandhi (revision 7720626)
|
||||
Flago de Hispanio (revision 5126776)
|
||||
Komuna Norma Datumaro (revision 7708004)
|
||||
LIBRIS (revision 5939936)
|
||||
Skatolfotilo (revision 5684184)
|
||||
Helporeĝisoro (revision 7193195)
|
||||
Antarkta traktato (revision 7832483)
|
||||
Asturlingva Vikipedio (revision 7316066)
|
||||
Heidelberg (revision 7591269)
|
||||
Duolingo (revision 7583415)
|
||||
Tajvano (revision 7792823)
|
||||
Voĉdono (revision 7550281)
|
||||
Ĉefurbo (revision 7147438)
|
||||
Planedo (revision 7731794)
|
||||
Francio (revision 7793270)
|
||||
Hookeriales (revision 6948596)
|
||||
Nupedia (revision 7407943)
|
||||
|
||||
== End of Parsed pages ==
|
||||
|
||||
- Wikipedia parsing ended at: 2022-12-14 23:54:19.220307
|
||||
|
||||
62 characters appeared 930894 times.
|
||||
|
||||
Most Frequent characters:
|
||||
[ 0] Char a: 12.552771851574937 %
|
||||
[ 1] Char o: 9.422447668585253 %
|
||||
[ 2] Char e: 9.31846160787372 %
|
||||
[ 3] Char i: 8.629983650125578 %
|
||||
[ 4] Char n: 7.460247890737291 %
|
||||
[ 5] Char r: 6.033232569981116 %
|
||||
[ 6] Char l: 5.973182768392535 %
|
||||
[ 7] Char t: 5.8455635120647464 %
|
||||
[ 8] Char s: 5.221002606096935 %
|
||||
[ 9] Char k: 4.088542841612472 %
|
||||
[10] Char d: 3.5726946354794427 %
|
||||
[11] Char j: 3.528973223589367 %
|
||||
[12] Char u: 2.914295290333808 %
|
||||
[13] Char p: 2.788072541019708 %
|
||||
[14] Char m: 2.7140576692942484 %
|
||||
[15] Char v: 1.6276826362614862 %
|
||||
[16] Char g: 1.6170476982341708 %
|
||||
[17] Char c: 1.2713585005381924 %
|
||||
[18] Char f: 1.0721951156630078 %
|
||||
[19] Char b: 1.0565112676631283 %
|
||||
[20] Char h: 0.6184377598308722 %
|
||||
[21] Char z: 0.5728901464613586 %
|
||||
[22] Char ĝ: 0.54367092279035 %
|
||||
[23] Char ŭ: 0.49189273966745944 %
|
||||
[24] Char ĉ: 0.3883363734216785 %
|
||||
[25] Char ŝ: 0.15683847999879685 %
|
||||
[26] Char w: 0.12633017293053775 %
|
||||
[27] Char y: 0.12493366591684982 %
|
||||
[28] Char ĵ: 0.09356596991709044 %
|
||||
[29] Char ĥ: 0.030615730684696644 %
|
||||
|
||||
The first 30 characters have an accumulated ratio of 0.9985583750674084.
|
||||
The first 5 characters have an accumulated ratio of 0.4738391266889678.
|
||||
All characters whose order is over 19 have an accumulated ratio of 0.0314751196161969.
|
||||
|
||||
1142 sequences found.
|
||||
|
||||
First 480 (typical positive ratio): 0.9950183309028746
|
||||
Next 266 (746-480): 0.003983484339076138
|
||||
Rest: 0.0009981847580492476
|
||||
|
||||
- Processing end: 2022-12-14 23:54:19.299182
|
||||
232
script/BuildLangModelLogs/LangEstonianModel.log
Normal file
232
script/BuildLangModelLogs/LangEstonianModel.log
Normal file
@ -0,0 +1,232 @@
|
||||
= Logs of language model for Estonian (et) =
|
||||
|
||||
- Generated by BuildLangModel.py
|
||||
- Started: 2022-12-14 23:51:10.973727
|
||||
- Maximum depth: 4
|
||||
- Max number of pages: 200
|
||||
|
||||
== Parsed pages ==
|
||||
|
||||
Harilik pohl (revision 6214729)
|
||||
Okasmets (revision 4634930)
|
||||
Kajakas (revision 5877896)
|
||||
Hendrik Relve (revision 6239649)
|
||||
Euraasia (revision 5979562)
|
||||
Klass (bioloogia) (revision 3489567)
|
||||
Põhja-Ameerika (revision 6057090)
|
||||
Leesikas (revision 6078259)
|
||||
Nõmm (revision 5979536)
|
||||
Juurestik (revision 3341159)
|
||||
Ida-Euroopa (revision 6093192)
|
||||
Tuhk (revision 5757120)
|
||||
Laanemets (revision 6263946)
|
||||
Antarktis (revision 6241944)
|
||||
Juurekael (revision 5994523)
|
||||
Kuldmikrofon (revision 6268376)
|
||||
Maailmajagu (revision 5713103)
|
||||
Masuut (revision 5343618)
|
||||
Eesti Maaülikool (revision 6242625)
|
||||
Maa (planeet) (revision 6267326)
|
||||
Melaneesia (revision 3521152)
|
||||
Kattekold (revision 5583465)
|
||||
Vingugaas (revision 6037399)
|
||||
Regioon (revision 6001414)
|
||||
1948 (revision 5944400)
|
||||
Mineraal (revision 6017794)
|
||||
1967 (revision 6260590)
|
||||
Lõuna-Aafrika (revision 5329241)
|
||||
Kultuur (revision 5849872)
|
||||
Leseleht (revision 5411820)
|
||||
Jääkajakas (revision 5726751)
|
||||
Ida-Siber (revision 5500451)
|
||||
Küttepuud (revision 6212731)
|
||||
Põhja-Euroopa (revision 6205189)
|
||||
Harilik ussilakk (revision 6126452)
|
||||
Euroopa (revision 6164900)
|
||||
Metsatulekahju (revision 6201635)
|
||||
Lõuna-Ameerika manner (revision 5969300)
|
||||
Imetajad (revision 6259823)
|
||||
Männik (revision 5798754)
|
||||
Tallinna Reaalkool (revision 6203404)
|
||||
Rahva Raamat (revision 6223805)
|
||||
Tiiu Relve (revision 6140880)
|
||||
Sinilill (revision 6078788)
|
||||
Kivisüsi (revision 6130479)
|
||||
Karbuskajakas (revision 5726756)
|
||||
Harilik mänd (revision 6214719)
|
||||
Vaskvitriol (revision 5293135)
|
||||
Elu (revision 5979638)
|
||||
Eesti Metsakorralduskeskus (revision 6147611)
|
||||
Edela-Euroopa (revision 5491146)
|
||||
Uus-Hispaania asekuningriik (revision 5973763)
|
||||
Kanarbik (revision 6160743)
|
||||
Pruunsüsi (revision 5536362)
|
||||
Alamklass (bioloogia) (revision 5129886)
|
||||
Tõmmukajakas (revision 6206529)
|
||||
Lääne-Siber (revision 5060366)
|
||||
Põhja-Ameerika manner (revision 5482041)
|
||||
Vaikne ookean (revision 5614735)
|
||||
Triibus (revision 5827255)
|
||||
Võrse (revision 6248963)
|
||||
Vaarikas (revision 6118808)
|
||||
Äraspidimunajas leht (revision 5015991)
|
||||
Magma (revision 5876846)
|
||||
Farmakopöa (revision 4639831)
|
||||
Perekond (bioloogia) (revision 6200916)
|
||||
Kanada (revision 6239766)
|
||||
Polüneesia (revision 4031998)
|
||||
Tuhaplokk (revision 6141202)
|
||||
Ladina keel (revision 6173071)
|
||||
Kaug-Ida (revision 5644298)
|
||||
Põhjavesi (revision 6035090)
|
||||
Alamliik (revision 5278935)
|
||||
Kuslapuu (revision 6147930)
|
||||
Põhja-Aasia (revision 5161593)
|
||||
Mikroneesia (revision 5131008)
|
||||
Tartu Ülikool (revision 6267789)
|
||||
Aafrika (revision 6267646)
|
||||
Öland (revision 6176389)
|
||||
Deliirium (revision 5734894)
|
||||
Siberi seedermänd (revision 6220739)
|
||||
Manner (revision 5960299)
|
||||
Salumetsad (revision 6138640)
|
||||
Aasia (revision 5829266)
|
||||
Kagu-Euroopa (revision 5990556)
|
||||
Piirkond (revision 6001414)
|
||||
Austraalia ja Okeaania (revision 4983417)
|
||||
Albaania (revision 6257268)
|
||||
Arumetsad (revision 4779035)
|
||||
Vulkaaniline tuhk (revision 5623096)
|
||||
Tahm (revision 5293173)
|
||||
Tsüstiit (revision 6141175)
|
||||
Rumeenia (revision 6177876)
|
||||
Ülemklass (revision 5432535)
|
||||
Lehis (revision 6132033)
|
||||
Tee (jook) (revision 6223529)
|
||||
Põhja-Ameerika inglise keel (revision 5408982)
|
||||
Leheroots (revision 6183341)
|
||||
Keelikloomad (revision 5939581)
|
||||
Austraalia ja Uus-Meremaa (revision 6145734)
|
||||
Nulg (revision 5558808)
|
||||
Austraalia manner (revision 5442365)
|
||||
Kuusk (revision 6267330)
|
||||
Liik (bioloogia) (revision 6203064)
|
||||
Maailmameri (revision 6100027)
|
||||
Taksaator (revision 4477687)
|
||||
Mets (revision 6120783)
|
||||
Tallinn (revision 6260994)
|
||||
Harilik pihlakas (revision 6020517)
|
||||
Turvas (revision 6166592)
|
||||
USGS (revision 6227151)
|
||||
Euraasia laam (revision 5375994)
|
||||
Ruutkilomeeter (revision 5300255)
|
||||
18. detsember (revision 6178181)
|
||||
Põhja-Jäämeri (revision 6099723)
|
||||
2015 (revision 6094792)
|
||||
Konvektsioon (revision 5825119)
|
||||
Binaarne nomenklatuur (revision 5719069)
|
||||
18. sajand Eestis (revision 6110372)
|
||||
Männas (revision 3543693)
|
||||
19. sajand (revision 5890985)
|
||||
1883 (revision 5887052)
|
||||
Harilik porss (revision 5411929)
|
||||
Jupiter (revision 6238990)
|
||||
Kuriili lehis (revision 5411660)
|
||||
Vesinik (revision 5931159)
|
||||
Kurvitsalised (revision 5855153)
|
||||
Palumetsad (revision 6262845)
|
||||
Pruunvetikad (revision 6201643)
|
||||
Polaartelg (revision 5550654)
|
||||
Kultuurimaja (revision 5922863)
|
||||
Antratsiit (revision 6156512)
|
||||
Viljandi (revision 6205977)
|
||||
Sete (revision 5976908)
|
||||
Ameerika (revision 6056843)
|
||||
Väike mandlipuu (revision 5397046)
|
||||
Geoloogiline aeg (revision 216025)
|
||||
Tarn (revision 6115293)
|
||||
Polaaralad (revision 6071951)
|
||||
Lääne-Aafrika (revision 5329227)
|
||||
1908. aasta suveolümpiamängud (revision 5300207)
|
||||
1799 (revision 4947859)
|
||||
15. august (revision 6178254)
|
||||
Reiu männikud (revision 5814120)
|
||||
Süsinik (revision 5951019)
|
||||
Maksahaigused (revision 5398828)
|
||||
Merikajakas (revision 6220119)
|
||||
Munajas leht (revision 3012434)
|
||||
Kagu-Aasia (revision 5244191)
|
||||
Tihumeeter (revision 5916725)
|
||||
Ida-Eesti (revision 5944399)
|
||||
Kaitsestaatus (revision 5622492)
|
||||
Lõuna-Eesti (revision 5756065)
|
||||
Geen (revision 6160416)
|
||||
Skandinaavia poolsaar (revision 4991435)
|
||||
Indoneesia (revision 6229579)
|
||||
9. juuli (revision 5992613)
|
||||
Hiidmanner (revision 3493207)
|
||||
Tallinna Tehnikaülikool (revision 6267300)
|
||||
Kuur (revision 5304082)
|
||||
Liblikalised (revision 6026473)
|
||||
Albaania linnad (revision 5842285)
|
||||
Kristallstruktuur (revision 6188534)
|
||||
Põhja-Eesti (revision 6126391)
|
||||
Prantslased (revision 5984522)
|
||||
Akadi keel (revision 6144297)
|
||||
Muld (revision 5957717)
|
||||
Külmaseen (revision 6264761)
|
||||
Larus canus canus (revision 5855148)
|
||||
Hoovus (revision 5754496)
|
||||
Teiin (revision 5717293)
|
||||
|
||||
== End of Parsed pages ==
|
||||
|
||||
- Wikipedia parsing ended at: 2022-12-14 23:54:01.496149
|
||||
|
||||
58 characters appeared 629128 times.
|
||||
|
||||
Most Frequent characters:
|
||||
[ 0] Char a: 12.73921999974568 %
|
||||
[ 1] Char e: 10.231463231647615 %
|
||||
[ 2] Char i: 10.16009460713877 %
|
||||
[ 3] Char s: 8.462188934525248 %
|
||||
[ 4] Char t: 6.624089215549141 %
|
||||
[ 5] Char l: 6.260729136201218 %
|
||||
[ 6] Char u: 5.553877748248369 %
|
||||
[ 7] Char n: 5.385549522513702 %
|
||||
[ 8] Char k: 4.786943197568698 %
|
||||
[ 9] Char o: 4.291972380819165 %
|
||||
[10] Char d: 3.9700982947826193 %
|
||||
[11] Char r: 3.892530613801961 %
|
||||
[12] Char m: 3.7164138299360383 %
|
||||
[13] Char v: 2.3497603031497563 %
|
||||
[14] Char p: 1.8603527422082626 %
|
||||
[15] Char g: 1.775314403428237 %
|
||||
[16] Char j: 1.7096679848933765 %
|
||||
[17] Char h: 1.5667717857097443 %
|
||||
[18] Char ä: 1.1379242379929044 %
|
||||
[19] Char õ: 0.9997965437875918 %
|
||||
[20] Char b: 0.9778614208873234 %
|
||||
[21] Char ü: 0.6551925840210577 %
|
||||
[22] Char f: 0.22761663763176967 %
|
||||
[23] Char c: 0.22666293663610582 %
|
||||
[24] Char ö: 0.2026614615785659 %
|
||||
[25] Char y: 0.06135476405437367 %
|
||||
[26] Char w: 0.054043056420950905 %
|
||||
[27] Char x: 0.031154232525018755 %
|
||||
[28] Char z: 0.024160425223483932 %
|
||||
[29] Char š: 0.02066352157271652 %
|
||||
[30] Char ž: 0.010172810620414289 %
|
||||
[31] Char q: 0.009219109624750449 %
|
||||
|
||||
The first 32 characters have an accumulated ratio of 0.9997552167444463.
|
||||
The first 4 characters have an accumulated ratio of 0.41592966773057316.
|
||||
All characters whose order is over 18 have an accumulated ratio of 0.035005595045841234.
|
||||
|
||||
876 sequences found.
|
||||
|
||||
First 431 (typical positive ratio): 0.9950077226033445
|
||||
Next 157 (588-431): 0.003997910901044732
|
||||
Rest: 0.000994366495610799
|
||||
|
||||
- Processing end: 2022-12-14 23:54:01.570903
|
||||
235
script/BuildLangModelLogs/LangFinnishModel.log
Normal file
235
script/BuildLangModelLogs/LangFinnishModel.log
Normal file
@ -0,0 +1,235 @@
|
||||
= Logs of language model for Finnish (fi) =
|
||||
|
||||
- Generated by BuildLangModel.py
|
||||
- Started: 2022-12-14 23:51:17.009255
|
||||
- Maximum depth: 4
|
||||
- Max number of pages: 200
|
||||
|
||||
== Parsed pages ==
|
||||
|
||||
Yhdistynyt kuningaskunta (revision 21066772)
|
||||
Sherlock Holmesin seikkailut (televisiosarja) (revision 19345728)
|
||||
Englannin sisällissota (revision 20681585)
|
||||
Damien Hirst (revision 20254144)
|
||||
Coldplay (revision 20996509)
|
||||
Puola (revision 21098204)
|
||||
Vanguard-luokka (sukellusvene) (revision 20477212)
|
||||
Unkari (revision 21093822)
|
||||
Antigua ja Barbuda (revision 20834245)
|
||||
Eurotunneli (revision 20871791)
|
||||
Urdu (revision 21069477)
|
||||
Gibraltar (revision 21007055)
|
||||
Tuvalu (revision 20860615)
|
||||
Fix You (revision 21005448)
|
||||
Manchester (revision 20895719)
|
||||
Blur (revision 20771440)
|
||||
Jimmy Carter (revision 20860817)
|
||||
Arcade Fire (revision 21107055)
|
||||
Väli-Amerikka (revision 20603598)
|
||||
Charles Ramirez (revision 20660516)
|
||||
Kioa (revision 20880316)
|
||||
UEFA (revision 20678496)
|
||||
23. lokakuuta (revision 20918625)
|
||||
Sadrin kieli (revision 20941171)
|
||||
Antigua ja Barbudan lippu (revision 20650267)
|
||||
Kuusi Napoleonia (revision 18401994)
|
||||
Lontoon yliopisto (revision 18248559)
|
||||
Torpedoputki (revision 19291207)
|
||||
Tinapilli (revision 20621327)
|
||||
Englannin kieli (revision 20829497)
|
||||
Rolling Stone (revision 20937647)
|
||||
Luettelo valtioista väkiluvun mukaan (revision 21110123)
|
||||
Varjojen maat (revision 19455482)
|
||||
Arthur Conan Doyle (revision 20650922)
|
||||
Tuvalu mo te Atua (revision 20825014)
|
||||
Vatikaani (revision 21017236)
|
||||
Trinidad ja Tobago (revision 21082777)
|
||||
The Pretenders (revision 20700307)
|
||||
Mauritius (revision 21031101)
|
||||
Hindustani (revision 16713728)
|
||||
Napoleon I (revision 20998435)
|
||||
Bristol (revision 20657021)
|
||||
Astute-luokka (revision 17775821)
|
||||
Bronisław Komorowski (revision 20657167)
|
||||
Antiikintutkimus (revision 20815035)
|
||||
Latvia (revision 21109727)
|
||||
Luterilaisuus (revision 21108515)
|
||||
Margaret Thatcher (revision 20827653)
|
||||
Liikevaihto (revision 19244199)
|
||||
Kaarle II (Englanti) (revision 21022937)
|
||||
Lahna (revision 20718556)
|
||||
Pariisi (revision 21098083)
|
||||
Ohio-luokka (revision 20916657)
|
||||
Säätyläiset (revision 20954041)
|
||||
Karaatti (revision 20827014)
|
||||
Kaarle I (Englanti) (revision 21028904)
|
||||
Englanti (revision 21068035)
|
||||
Parlophone Records (revision 20332700)
|
||||
Postpositio (revision 19287247)
|
||||
Yhdysvaltain Neitsytsaaret (revision 20804420)
|
||||
Sherlock Holmes (revision 21038050)
|
||||
Iso-Britannia (revision 21066772)
|
||||
Arabialainen kirjaimisto (revision 20475019)
|
||||
Charles Saatchi (revision 14917996)
|
||||
Gibraltarin pääministeri (revision 16808956)
|
||||
Newcastle (revision 21050270)
|
||||
Väestötiheys (revision 20092734)
|
||||
Internet (revision 21025514)
|
||||
Marin tasavalta (revision 20896113)
|
||||
Teollisuus (revision 20956826)
|
||||
Doverinsalmi (revision 20369550)
|
||||
Boston (revision 20992854)
|
||||
Israelin kansalliskirjasto (revision 20854961)
|
||||
Gibtelecom (revision 21007077)
|
||||
Sindhi (revision 20948345)
|
||||
Unkarin sosialistinen työväenpuolue (revision 18743747)
|
||||
Kookosmaito (revision 21094033)
|
||||
Arabian kieli (revision 21060827)
|
||||
BRIT Awards (revision 19302405)
|
||||
Gotthardin pohjatunneli (revision 21061923)
|
||||
Mecsek (revision 20921164)
|
||||
British Airways (revision 20984077)
|
||||
Gawarin kieli (revision 13035766)
|
||||
ITV (revision 20415578)
|
||||
Megawatti (revision 20639645)
|
||||
Bangladesh (revision 21101257)
|
||||
HMS Victorious (S29) (revision 20088762)
|
||||
Kivennuoliainen (revision 20945527)
|
||||
Tuvalun dollari (revision 16801336)
|
||||
Irena Szewińska (revision 19753210)
|
||||
Ilja Leonard Pfeijffer (revision 21048419)
|
||||
Neitsyt Maria (revision 20896050)
|
||||
BBC Two (revision 20819614)
|
||||
Tanganjika (revision 20446073)
|
||||
Khowarin kieli (revision 19310691)
|
||||
Saint Lucia (revision 21065825)
|
||||
Bundelin kieli (revision 14167989)
|
||||
Lontoo (revision 20946337)
|
||||
Sinhali (revision 19311081)
|
||||
Johnstonin atolli (revision 18905507)
|
||||
Guinnessin ennätyskirja (revision 20839808)
|
||||
Montserrat (revision 21048411)
|
||||
Eurostar International (revision 20678739)
|
||||
Jamaika (revision 21055658)
|
||||
HMS Vanguard (S28) (revision 20088459)
|
||||
Yanito (revision 20355121)
|
||||
1965 (revision 20952728)
|
||||
BBC (revision 20873802)
|
||||
1984 (revision 21076882)
|
||||
Ravintola (revision 20579600)
|
||||
Vähittäiskauppa (revision 21059296)
|
||||
Krzysztof Komeda (revision 17942536)
|
||||
François Mitterrand (revision 20343193)
|
||||
Lublin (revision 19195589)
|
||||
Pitkäperjantai (revision 20423940)
|
||||
Johannes Paavali II (revision 21066870)
|
||||
Karibia (revision 20786667)
|
||||
7. kesäkuuta (revision 20953482)
|
||||
Deutsche Bahn (revision 21040025)
|
||||
Gibraltar Chronicle (revision 21007056)
|
||||
Alankomaat (revision 21066782)
|
||||
Englannin kuningaskunta (revision 20703315)
|
||||
Grand Hotel Europa (revision 20256757)
|
||||
Julkisen palvelun yleisradiotoiminta (revision 20950803)
|
||||
Gaston Browne (revision 20836659)
|
||||
Monaco (revision 20905943)
|
||||
Tokaji (revision 20197418)
|
||||
Csongrád (lääni) (revision 19494157)
|
||||
Nato (revision 21049954)
|
||||
Venetsian biennaali (revision 20900561)
|
||||
Yleisradiotoiminta (revision 20950803)
|
||||
Britannia (revision 21066772)
|
||||
Malawi (Commonwealth realm) (revision 20446067)
|
||||
Platina (revision 20315754)
|
||||
Permin Komi (revision 20926038)
|
||||
UGM-27 Polaris (revision 20627059)
|
||||
Yhdysvaltain dollari (revision 21093482)
|
||||
Samarkand (revision 20861839)
|
||||
Uganda (Commonwealth realm) (revision 20446074)
|
||||
Ympäristönsuojelu (revision 20650048)
|
||||
Guadeloupe (revision 20300349)
|
||||
Rehtori (revision 20935388)
|
||||
Joseph Bell (revision 20958309)
|
||||
Fejér (revision 15207333)
|
||||
OFC Nations Cup (revision 15982936)
|
||||
Metsäsuomalaiset (revision 21027788)
|
||||
Nicaragua (revision 21069170)
|
||||
Westminsterin palatsi (revision 20640014)
|
||||
Sosiaalihistoria (revision 20334015)
|
||||
Gabriel (revision 21006459)
|
||||
Pikkuviha (revision 20526971)
|
||||
Virtual International Authority File (revision 21019677)
|
||||
Uusi-Guinea (revision 20516634)
|
||||
Kuuba (revision 21030857)
|
||||
Sanya Richards-Ross (revision 20944016)
|
||||
Säätiö (revision 20613246)
|
||||
Dover (Englanti) (revision 20827636)
|
||||
Musiikki (revision 20907775)
|
||||
Suomen kieli (revision 21076647)
|
||||
Murre (revision 20584718)
|
||||
Ensimmäinen maailmansota (revision 21038559)
|
||||
Edam (juusto) (revision 19473248)
|
||||
Henrik VIII (Englanti) (revision 21014997)
|
||||
Christopher Wren (revision 20136409)
|
||||
Mongolia (revision 20916298)
|
||||
Al-Andalus (revision 21099577)
|
||||
Vesa-Pekka Rannikko (revision 20633828)
|
||||
Devanagari (revision 20666572)
|
||||
1940 (revision 20989523)
|
||||
Kookospähkinä (revision 21028626)
|
||||
SBB-CFF-FFS (revision 20940583)
|
||||
Kaupunkivaltio (revision 21025690)
|
||||
Murmanskin alue (revision 21049065)
|
||||
Jean-Claude Juncker (revision 20875578)
|
||||
Tim Berners-Lee (revision 20522743)
|
||||
Armenia (revision 20987696)
|
||||
Békés (lääni) (revision 19786715)
|
||||
|
||||
== End of Parsed pages ==
|
||||
|
||||
- Wikipedia parsing ended at: 2022-12-14 23:54:09.799129
|
||||
|
||||
76 characters appeared 1536977 times.
|
||||
|
||||
Most Frequent characters:
|
||||
[ 0] Char a: 12.614697552403193 %
|
||||
[ 1] Char i: 11.0243029010844 %
|
||||
[ 2] Char t: 8.821992781934929 %
|
||||
[ 3] Char n: 8.82095177741762 %
|
||||
[ 4] Char e: 7.688859364844107 %
|
||||
[ 5] Char s: 7.632775246474085 %
|
||||
[ 6] Char l: 6.030083729294583 %
|
||||
[ 7] Char o: 5.542438175717659 %
|
||||
[ 8] Char u: 5.201379070734305 %
|
||||
[ 9] Char k: 4.7905726630912495 %
|
||||
[10] Char r: 3.1331633459706945 %
|
||||
[11] Char m: 3.0228168671359428 %
|
||||
[12] Char ä: 3.0008256467077907 %
|
||||
[13] Char v: 2.2698452872098933 %
|
||||
[14] Char j: 1.9956056596813094 %
|
||||
[15] Char p: 1.7456995127448232 %
|
||||
[16] Char h: 1.7247492968339802 %
|
||||
[17] Char y: 1.5948189205173533 %
|
||||
[18] Char d: 1.110296380492356 %
|
||||
[19] Char g: 0.48133446369073835 %
|
||||
[20] Char b: 0.4582371759629455 %
|
||||
[21] Char ö: 0.40117711585794713 %
|
||||
[22] Char c: 0.3493220783394937 %
|
||||
[23] Char f: 0.20696471059749108 %
|
||||
[24] Char w: 0.14886364597518376 %
|
||||
[25] Char z: 0.06128914095656604 %
|
||||
[26] Char x: 0.028497498661333255 %
|
||||
[27] Char é: 0.01450900045999387 %
|
||||
[28] Char q: 0.013858372636675761 %
|
||||
|
||||
The first 29 characters have an accumulated ratio of 0.9992992738342864.
|
||||
The first 4 characters have an accumulated ratio of 0.4128194501284014.
|
||||
All characters whose order is over 17 have an accumulated ratio of 0.03274349583630724.
|
||||
|
||||
1146 sequences found.
|
||||
|
||||
First 417 (typical positive ratio): 0.9950442901604022
|
||||
Next 226 (643-417): 0.003959181230548281
|
||||
Rest: 0.0009965286090495296
|
||||
|
||||
- Processing end: 2022-12-14 23:54:09.877879
|
||||
@ -1,188 +1,263 @@
|
||||
= Logs of language model for French (fr) =
|
||||
|
||||
- Generated by BuildLangModel.py
|
||||
- Started: 2015-11-30 18:53:23.881008
|
||||
- Maximum depth: 2
|
||||
- Max number of pages: 10
|
||||
- Started: 2022-12-14 20:35:28.078254
|
||||
- Maximum depth: 4
|
||||
- Max number of pages: 200
|
||||
|
||||
== Parsed pages ==
|
||||
|
||||
Wikipédia:Accueil_principal (revision 115957655)
|
||||
Bœuf (animal) (revision 115500130)
|
||||
1672 (revision 120907902)
|
||||
1727 (revision 120908296)
|
||||
24 novembre (revision 120782024)
|
||||
26 novembre (revision 120833172)
|
||||
27 novembre (revision 120860032)
|
||||
28 novembre (revision 120900893)
|
||||
30 novembre (revision 120934923)
|
||||
Amsterdam (revision 120834895)
|
||||
Amérique (revision 120916912)
|
||||
An mil (revision 120416538)
|
||||
Ancien Régime (revision 120708739)
|
||||
Anjou (revision 120590957)
|
||||
António Costa (revision 120928729)
|
||||
Armée de l'air turque (revision 120764207)
|
||||
Artémise II (revision 120920820)
|
||||
Attentat du 24 novembre 2015 à Tunis (revision 120924574)
|
||||
Barbro Hiort af Ornäs (revision 120933311)
|
||||
Bataille d’Attu (revision 120942542)
|
||||
Bretagne (revision 120828180)
|
||||
Candé (revision 120928722)
|
||||
Canton de Candé (revision 120383860)
|
||||
Chef-lieu (revision 119340707)
|
||||
Chouannerie (revision 119799524)
|
||||
Commune (France) (revision 120627882)
|
||||
Conférence de Paris de 2015 sur le climat (revision 120944002)
|
||||
Crise de la dette publique grecque (revision 120905208)
|
||||
Crise entre la Colombie et le Venezuela de 2015 (revision 120857143)
|
||||
Crise migratoire en Europe (revision 120906358)
|
||||
Crise russo-turque de 2015 (revision 120936864)
|
||||
Deuxième guerre civile libyenne (revision 120673125)
|
||||
Déesse mère (revision 120904195)
|
||||
Départements français (revision 120873309)
|
||||
Effet Shapiro (revision 120893782)
|
||||
Eldar Riazanov (revision 120924339)
|
||||
Fatima Mernissi (revision 120942794)
|
||||
Français (revision 120883858)
|
||||
Gerry Byrne (football) (revision 120943526)
|
||||
Guerre civile sud-soudanaise (revision 120672963)
|
||||
Guerre civile syrienne (revision 120868598)
|
||||
Guerre d'Afghanistan (depuis 2015) (revision 120675052)
|
||||
Guerre du Donbass (revision 120862085)
|
||||
Guerre du Yémen (depuis 2001) (revision 118472483)
|
||||
Insurrection djihadiste au Nigeria (revision 120550223)
|
||||
Irwin Shapiro (revision 116730530)
|
||||
Ismaïl ben Chérif (revision 120930731)
|
||||
Ivan Hlevnjak (revision 120917619)
|
||||
Jean Corti (revision 120935599)
|
||||
Jean Joubert (revision 120924134)
|
||||
Karashima Noboru (revision 120892854)
|
||||
Latin (revision 120360207)
|
||||
Luc Bondy (revision 120941142)
|
||||
Maine-et-Loire (revision 120890165)
|
||||
Marches de Bretagne (revision 115772332)
|
||||
Mark Behr (revision 120943649)
|
||||
Maroc (revision 120937137)
|
||||
Maurice Strong (revision 120927161)
|
||||
Mausole (revision 120904648)
|
||||
Moyen Âge (revision 120943615)
|
||||
Novembre 2015 (revision 120866496)
|
||||
Olene S. Walker (revision 120927070)
|
||||
Paternité (revision 119371049)
|
||||
Pays de la Loire (revision 120719853)
|
||||
Philippe II Auguste (revision 120910593)
|
||||
Philippe Washer (revision 120939362)
|
||||
Premier ministre de Portugal (revision 120888501)
|
||||
Relativité générale (revision 120814809)
|
||||
Régions françaises (revision 120692851)
|
||||
Seconde Guerre mondiale (revision 120884001)
|
||||
Seconde guerre civile irakienne (revision 120893282)
|
||||
Shigeru Mizuki (revision 120931351)
|
||||
Soukhoï Su-24 (revision 120892538)
|
||||
Spuistraat (revision 119667601)
|
||||
Syrie (revision 120692724)
|
||||
Tahir Elçi (revision 120942499)
|
||||
Tunis (revision 120628797)
|
||||
Vague de violence israélo-palestinienne de l'automne 2015 (revision 120927782)
|
||||
Wiki (revision 120671138)
|
||||
Wikimedia Foundation (revision 120519147)
|
||||
Wikipédia en français (revision 120692561)
|
||||
XVIIIe siècle (revision 119843235)
|
||||
XVIIe siècle (revision 120773755)
|
||||
Église de Jésus-Christ des saints des derniers jours (revision 120924507)
|
||||
Agriculture (revision 120943777)
|
||||
Anesthésie (revision 120319446)
|
||||
Animal de trait (revision 120819989)
|
||||
Bien-être animal (revision 120205455)
|
||||
Bière (revision 119961318)
|
||||
Bos taurus (revision 119683704)
|
||||
Bête de somme (revision 117842569)
|
||||
Bœuf Gras (revision 119942055)
|
||||
Bœuf de Kobe (revision 120829709)
|
||||
Castration (revision 119751330)
|
||||
Chapon (revision 114928344)
|
||||
Charrette (revision 120909407)
|
||||
Charrue (revision 120819690)
|
||||
Colonisation (revision 120146837)
|
||||
Edme Gaulle (revision 118241504)
|
||||
Europe de l'Ouest (revision 120854797)
|
||||
Géant-Bœuf du Carnaval de Paris (revision 118480900)
|
||||
Hongre (revision 120607208)
|
||||
Hypoxie (revision 118470557)
|
||||
Japon (revision 120742182)
|
||||
Labour (revision 120144019)
|
||||
Marché des Blancs-Manteaux (revision 106807185)
|
||||
Monde musulman (revision 120793714)
|
||||
Mâle (biologie) (revision 111721849)
|
||||
Mésopotamie (revision 120642895)
|
||||
Promenade du Bœuf Gras au Carnaval de Paris (revision 120874240)
|
||||
Rue des Hospitalières-Saint-Gervais (revision 107834996)
|
||||
Takayama (revision 118810594)
|
||||
Taureau (revision 120459397)
|
||||
Testicule (revision 120432335)
|
||||
Testostérone (revision 119909685)
|
||||
Traction animale (revision 120819989)
|
||||
Traction bovine (revision 111651361)
|
||||
Traîneau (revision 120604907)
|
||||
Viande (revision 120600247)
|
||||
Viande bovine (revision 119480442)
|
||||
Wagyu (revision 120910460)
|
||||
XXe siècle (revision 120793535)
|
||||
Élevage bovin (revision 120877235)
|
||||
Bœuf (animal) (revision 197762352)
|
||||
Hongre (revision 192403538)
|
||||
Mésopotamie (revision 199078207)
|
||||
Canada (revision 199464527)
|
||||
Promenade du Bœuf Gras au Carnaval de Paris (revision 198921694)
|
||||
Labour (revision 196168038)
|
||||
Testicule (revision 199482540)
|
||||
Taureau (revision 197805532)
|
||||
Bien-être animal (revision 197276347)
|
||||
Traîneau (revision 192374700)
|
||||
Bœuf de Kobe (revision 192878601)
|
||||
Marché des Blancs-Manteaux (revision 180317195)
|
||||
Art rupestre du Valcamonica (revision 194978083)
|
||||
Arme à feu (revision 197634282)
|
||||
Inondation (revision 197820876)
|
||||
Élevage équin (revision 193185231)
|
||||
Turquie (revision 199420925)
|
||||
Paronymie (revision 199270951)
|
||||
Parlement de Paris (revision 199191505)
|
||||
Animal domestique en droit français (revision 195719387)
|
||||
Veau d'or (revision 196918081)
|
||||
Appareil génital féminin (revision 198910395)
|
||||
Tendreté (revision 194393321)
|
||||
Hygiène (revision 196930859)
|
||||
Apis (revision 197792729)
|
||||
Ordre des serviteurs de la Sainte Vierge (revision 183370520)
|
||||
Neige (revision 199283662)
|
||||
Alcurrucén (revision 196358906)
|
||||
Période Yayoi (revision 196966397)
|
||||
Baltasar Ibán (revision 165418210)
|
||||
Care (gemmage) (revision 179398144)
|
||||
Néoplatonisme (revision 199045013)
|
||||
INSL3 (revision 194404910)
|
||||
Monarchie lucienne (revision 197410562)
|
||||
Ordre mendiant (revision 199436160)
|
||||
Rate (revision 199394518)
|
||||
Marché couvert (revision 192688251)
|
||||
Baleine (revision 199327939)
|
||||
Le Cheval (nouvelle) (revision 193840091)
|
||||
Le Marais (quartier parisien) (revision 199292776)
|
||||
Vous avez deux vaches (revision 194794493)
|
||||
Jean-Marie Pourquier (revision 184883025)
|
||||
Palha (revision 197762586)
|
||||
Jacques Hillairet (revision 198554519)
|
||||
Parasitisme (revision 197447846)
|
||||
Viande (revision 199230345)
|
||||
Fourche (revision 198888946)
|
||||
Québec bashing (revision 199195942)
|
||||
Préfecture de police (revision 194841953)
|
||||
Musée national des arts et traditions populaires (Paris) (revision 197374487)
|
||||
Samoyède (chien) (revision 195852684)
|
||||
Hippolyte Bayard (revision 197131584)
|
||||
Lazy bed (revision 129995180)
|
||||
Traction bovine (revision 179198458)
|
||||
Bile d'ours (revision 196853037)
|
||||
Agriculture de conservation (revision 197781198)
|
||||
Viande bovine (revision 198200168)
|
||||
Phénotype (revision 198078277)
|
||||
Castration (revision 199351966)
|
||||
Enjambeur (revision 185177634)
|
||||
Autonomie provinciale (revision 199376762)
|
||||
Empire ottoman (revision 199325394)
|
||||
Semis direct (revision 197965033)
|
||||
Crosse (sport) (revision 198259303)
|
||||
Endogé (revision 188797617)
|
||||
Rue des Hospitalières-Saint-Gervais (revision 188012948)
|
||||
Rue du Marché-des-Blancs-Manteaux (revision 197761214)
|
||||
Bulbe du vestibule (revision 198487397)
|
||||
Allemagne nazie (revision 199375588)
|
||||
Pioche (revision 199117113)
|
||||
Cœur (revision 199311128)
|
||||
Okanagan (revision 191726303)
|
||||
Stimulation sexuelle (revision 190390283)
|
||||
Tract (revision 192402962)
|
||||
IIe siècle av. J.-C. (revision 192067750)
|
||||
Mer du Labrador (revision 172314988)
|
||||
Époque d'Edo (revision 195213554)
|
||||
Chatt-el-Arab (revision 192696405)
|
||||
Route des Grands Crus (revision 198871159)
|
||||
Louis-Philippe Ier (revision 199381721)
|
||||
Internet Archive (revision 196621011)
|
||||
1824 (revision 199087297)
|
||||
Cabestro (revision 192846627)
|
||||
Immunologique (revision 192718418)
|
||||
Pierre-Jules Delespine (revision 150590520)
|
||||
Mulet (revision 199468728)
|
||||
Griffon (mythologie) (revision 199398127)
|
||||
Tukulti-Ninurta II (revision 198978146)
|
||||
Régiment du train français (revision 176508922)
|
||||
Reproduction du cheval (revision 196596865)
|
||||
Paillis (revision 195545586)
|
||||
Jeux olympiques d'hiver de 1988 (revision 196401973)
|
||||
Prix de l'Arc de Triomphe (revision 199113863)
|
||||
Mi-Carême au Carnaval de Paris (revision 198316382)
|
||||
Camillien Houde (revision 196170671)
|
||||
Rectum (revision 198256154)
|
||||
Appellation d'origine (revision 196356478)
|
||||
Éthique (revision 197675343)
|
||||
Monarchie bélizienne (revision 197578724)
|
||||
Japon (revision 199440474)
|
||||
Teppanyaki (revision 196061220)
|
||||
Grenier (revision 198635007)
|
||||
Taureau Osborne (revision 193322732)
|
||||
Viande de bœuf (revision 198200168)
|
||||
Élevage bovin (revision 198420240)
|
||||
Tubulidentata (revision 141739462)
|
||||
Arabes (revision 199107906)
|
||||
France (revision 199348136)
|
||||
Ve millénaire av. J.-C. (revision 197832201)
|
||||
Course de taureaux (revision 196352587)
|
||||
Zalduendo (élevage) (revision 180445009)
|
||||
Alimentation humaine (revision 197838990)
|
||||
Conseil fédéral (Suisse) (revision 199396637)
|
||||
Torrestrella (revision 170532693)
|
||||
Aqueduc (revision 199286061)
|
||||
Napoléon (revision 199381695)
|
||||
Binette (outil) (revision 189825683)
|
||||
Royaume-Uni (revision 199157233)
|
||||
Journal d'agriculture pratique (revision 198002826)
|
||||
Véhicule (transport physique) (revision 196078387)
|
||||
Char réclame (revision 191386926)
|
||||
Earthlings (revision 199150202)
|
||||
Jean-Jacques Glassner (revision 181386691)
|
||||
Adolf Portmann (revision 197210182)
|
||||
Pierre naturelle (revision 199467513)
|
||||
Centre équestre (revision 196870774)
|
||||
Sperme (revision 197533836)
|
||||
Site web (revision 199318333)
|
||||
Jerf el Ahmar (revision 199170363)
|
||||
28 février (revision 198228456)
|
||||
Batteuse (revision 196001423)
|
||||
Période d'Uruk (revision 199006250)
|
||||
Elias Canetti (revision 199075881)
|
||||
Argile (revision 199107823)
|
||||
Antoine Antignac (revision 185029059)
|
||||
Abattage d'un arbre (revision 197239365)
|
||||
Alexisonfire (revision 197203952)
|
||||
Riz (revision 199431799)
|
||||
Liste lexicale (revision 194757836)
|
||||
Colonne de la liberté (Québec) (revision 181281869)
|
||||
Éditions de Minuit (revision 194713174)
|
||||
Pedigree (revision 180077221)
|
||||
Drapeau de l'Espagne (revision 199373078)
|
||||
Autorité (sciences de l'information) (revision 199134298)
|
||||
Carnaval de Paris (revision 198965123)
|
||||
Jument (revision 192426506)
|
||||
Immunoglobuline G (revision 195943996)
|
||||
Chasse à la palombe (revision 194440835)
|
||||
Europe (revision 199248008)
|
||||
Dominique de Guzmán (revision 198851982)
|
||||
Droit commercial (revision 195666376)
|
||||
.fr (revision 193628283)
|
||||
Anthroponymie (revision 193904253)
|
||||
Tell el-Amarna (revision 197374713)
|
||||
Aviculture (revision 197913491)
|
||||
Universalisme (philosophie) (revision 199000629)
|
||||
Moissonneuse (revision 198209470)
|
||||
4 novembre (revision 199233694)
|
||||
Montréal (revision 198767676)
|
||||
Substantif (revision 194987031)
|
||||
Mars 1813 (revision 174524777)
|
||||
Homo sapiens (revision 199051865)
|
||||
Antonomase (revision 197771312)
|
||||
Censure en France (revision 198329267)
|
||||
Griffe de jardin (revision 190033885)
|
||||
Alexandre IV (pape) (revision 197191302)
|
||||
Poulain (revision 196365805)
|
||||
Liste du patrimoine mondial en Suisse (revision 194058035)
|
||||
1979 (revision 199084851)
|
||||
Pilosa (revision 188056955)
|
||||
Additif alimentaire (revision 196477352)
|
||||
Islero (revision 177295684)
|
||||
Œillet rouge sur le sable (revision 180973401)
|
||||
Mezzavia (revision 137003979)
|
||||
Corne (matière) (revision 196535097)
|
||||
1913 (revision 199086121)
|
||||
Suisse (revision 199273297)
|
||||
Halle aux Grains de Toulouse (revision 197773169)
|
||||
Paul Gaffarel (revision 195679573)
|
||||
Himiko (reine) (revision 199049471)
|
||||
Sylvain Gaudreault (revision 197631213)
|
||||
Académie des beaux-arts (France) (revision 199189321)
|
||||
Poulinière (revision 187497518)
|
||||
Cruauté (revision 198910746)
|
||||
Liste des quartiers administratifs de Paris (revision 196613235)
|
||||
Révolution française (revision 199249888)
|
||||
Michael Greger (revision 190001227)
|
||||
Produit phytosanitaire (revision 199128753)
|
||||
Chien finnois de Laponie (revision 184586704)
|
||||
1941 en littérature (revision 193870278)
|
||||
Iwashimizu Hachiman-gū (revision 193554753)
|
||||
Cultivateur (outil) (revision 194054028)
|
||||
Pakistan (revision 199314711)
|
||||
Noblesse (revision 198900738)
|
||||
Suave (corrida) (revision 178500444)
|
||||
Phorésie (revision 193292867)
|
||||
|
||||
== End of Parsed pages ==
|
||||
|
||||
- Wikipedia parsing ended at: 2015-11-30 19:05:38.631196
|
||||
- Wikipedia parsing ended at: 2022-12-14 20:39:17.167123
|
||||
|
||||
58 characters appeared 2625348 times.
|
||||
60 characters appeared 3423485 times.
|
||||
|
||||
First 38 characters:
|
||||
[ 0] Char e: 14.297990209297968 %
|
||||
[ 1] Char s: 8.062245462315854 %
|
||||
[ 2] Char a: 8.006862328346566 %
|
||||
[ 3] Char n: 7.458401705221555 %
|
||||
[ 4] Char i: 7.3982572976992 %
|
||||
[ 5] Char r: 6.902246864034788 %
|
||||
[ 6] Char t: 6.851777364372266 %
|
||||
[ 7] Char l: 5.928699738091865 %
|
||||
[ 8] Char o: 5.30996271732357 %
|
||||
[ 9] Char u: 5.181065519694913 %
|
||||
[10] Char d: 4.153773137884959 %
|
||||
[11] Char c: 3.1908912647009084 %
|
||||
[12] Char m: 2.8650297027289335 %
|
||||
[13] Char p: 2.801228637117822 %
|
||||
[14] Char é: 2.4742624596815355 %
|
||||
[15] Char v: 1.2647847066369868 %
|
||||
[16] Char g: 1.2577761119668707 %
|
||||
[17] Char f: 1.1079293107047143 %
|
||||
[18] Char b: 1.030415777260767 %
|
||||
[19] Char h: 0.9089842565633204 %
|
||||
[20] Char q: 0.7969610124067362 %
|
||||
[21] Char x: 0.43415196766295366 %
|
||||
[22] Char è: 0.398613821862854 %
|
||||
[23] Char à: 0.38916745513356704 %
|
||||
[24] Char y: 0.3763310616344957 %
|
||||
[25] Char j: 0.31298707828447886 %
|
||||
[26] Char k: 0.20576319786938724 %
|
||||
[27] Char z: 0.11880329769615304 %
|
||||
[28] Char ê: 0.11221369509870692 %
|
||||
[29] Char ç: 0.07610419647223911 %
|
||||
[30] Char w: 0.06574366522076312 %
|
||||
[31] Char ô: 0.04845071967602009 %
|
||||
[32] Char â: 0.0448321517756884 %
|
||||
[33] Char œ: 0.03778546691714774 %
|
||||
[34] Char î: 0.03725220427920413 %
|
||||
[35] Char ï: 0.02704403378142631 %
|
||||
[36] Char û: 0.02285411305472646 %
|
||||
[37] Char ù: 0.02034016061870655 %
|
||||
Most Frequent characters:
|
||||
[ 0] Char e: 14.33974444170195 %
|
||||
[ 1] Char s: 7.990629431704827 %
|
||||
[ 2] Char a: 7.761126454475484 %
|
||||
[ 3] Char i: 7.4210052037616645 %
|
||||
[ 4] Char n: 7.372516602234273 %
|
||||
[ 5] Char t: 6.9224196980562205 %
|
||||
[ 6] Char r: 6.772163453323149 %
|
||||
[ 7] Char l: 5.904684845997572 %
|
||||
[ 8] Char u: 5.404697260247963 %
|
||||
[ 9] Char o: 5.389975419784226 %
|
||||
[10] Char d: 4.168763701316056 %
|
||||
[11] Char c: 3.3637652859586065 %
|
||||
[12] Char p: 2.967998983491968 %
|
||||
[13] Char m: 2.8252789189962857 %
|
||||
[14] Char é: 2.495118278596226 %
|
||||
[15] Char v: 1.2636830598060165 %
|
||||
[16] Char g: 1.192731967571057 %
|
||||
[17] Char f: 1.0616374834415807 %
|
||||
[18] Char b: 0.9595485302257787 %
|
||||
[19] Char h: 0.9245257391225608 %
|
||||
[20] Char q: 0.8608479371167101 %
|
||||
[21] Char x: 0.47942374510184793 %
|
||||
[22] Char è: 0.38802565222280805 %
|
||||
[23] Char à: 0.3858056921528793 %
|
||||
[24] Char y: 0.3632555714425505 %
|
||||
[25] Char j: 0.29528389930144283 %
|
||||
[26] Char ê: 0.13754989433282166 %
|
||||
[27] Char k: 0.1343075842306889 %
|
||||
[28] Char z: 0.10606151334093766 %
|
||||
[29] Char ç: 0.06426200202425306 %
|
||||
[30] Char w: 0.05123434161388176 %
|
||||
[31] Char ô: 0.04477893141053634 %
|
||||
[32] Char î: 0.0410108412918415 %
|
||||
[33] Char â: 0.04057269127803977 %
|
||||
[34] Char œ: 0.030203140951398942 %
|
||||
[35] Char ù: 0.020008850630278796 %
|
||||
[36] Char ï: 0.017117060539187406 %
|
||||
[37] Char û: 0.016065500506063264 %
|
||||
|
||||
The first 38 characters have an accumulated ratio of 0.9997798387109063.
|
||||
The first 38 characters have an accumulated ratio of 0.9997782960930166.
|
||||
The first 5 characters have an accumulated ratio of 0.448850221338782.
|
||||
All characters whose order is over 19 have an accumulated ratio of 0.03475814849488167.
|
||||
|
||||
1149 sequences found.
|
||||
1187 sequences found.
|
||||
|
||||
First 512 (typical positive ratio): 0.997044499777764
|
||||
Next 512 (512-1024): 3.8090188424544096e-07
|
||||
Rest: 5.974086801089403e-05
|
||||
First 450 (typical positive ratio): 0.9950352320661208
|
||||
Next 168 (618-450): 0.003966397970469049
|
||||
Rest: 0.000998369963410184
|
||||
|
||||
- Processing end: 2015-11-30 19:05:38.842420
|
||||
- Processing end: 2022-12-14 20:39:17.243824
|
||||
|
||||
240
script/BuildLangModelLogs/LangGeorgianModel.log
Normal file
240
script/BuildLangModelLogs/LangGeorgianModel.log
Normal file
@ -0,0 +1,240 @@
|
||||
= Logs of language model for Georgian (ka) =
|
||||
|
||||
- Generated by BuildLangModel.py
|
||||
- Started: 2022-12-20 12:39:32.203539
|
||||
- Maximum depth: 4
|
||||
- Max number of pages: 200
|
||||
|
||||
== Parsed pages ==
|
||||
|
||||
არნოლდ შონბერგი (revision 4450208)
|
||||
ნიკოლოზ საკვირველთმოქმედი (revision 4452640)
|
||||
ინგლისური (revision 4330414)
|
||||
აშშ (revision 4451590)
|
||||
პროკოპი კესარიელი (revision 4424699)
|
||||
მთვარის პიერო (revision 4429979)
|
||||
კომპოზიტორი (revision 4257818)
|
||||
ტეტრარქია (revision 4308810)
|
||||
ახალი ზელანდია (revision 4416034)
|
||||
ფილოსოფია (revision 4288867)
|
||||
კონსტანტინე I დიდი (revision 4375061)
|
||||
სული (revision 4451196)
|
||||
ჯონ კეიჯი (revision 4417221)
|
||||
ადოლფ ჰიტლერი (revision 4442491)
|
||||
იუდაიზმი (revision 4355617)
|
||||
15 მარტი (revision 4265284)
|
||||
350 (revision 3827298)
|
||||
ჟორჟ ბიზე (revision 4430611)
|
||||
ვატიკანის მუზეუმები (revision 4287947)
|
||||
რიხარდ ვაგნერი (revision 4435391)
|
||||
ნაცისტური პარტია (revision 4387148)
|
||||
ნიკეის პირველი საეკლესიო კრება (revision 4393312)
|
||||
III საუკუნე (revision 3158968)
|
||||
წმინდა მიწა (revision 4053366)
|
||||
1991 (revision 4405448)
|
||||
ჯაზი (revision 4433861)
|
||||
იოანე მალალა (revision 3928608)
|
||||
1975 (revision 4167940)
|
||||
ლაიბნიცი, გოტფრიდ ვილჰელმ (revision 4406460)
|
||||
XIII საუკუნე (revision 3158955)
|
||||
საბუნებისმეტყველო მეცნიერებები (revision 784852)
|
||||
კომპიუტერი (revision 4291207)
|
||||
იოანე II (ბიზანტია) (revision 4245444)
|
||||
ოსიანი (revision 4275863)
|
||||
რომის კურია (revision 4093740)
|
||||
ანგლო-საქსური პერიოდი (revision 4212229)
|
||||
პოეტი (revision 4273738)
|
||||
ევროკავშირის ენები (revision 4137186)
|
||||
ბიორკი (revision 4422616)
|
||||
სვასტიკა (revision 4104250)
|
||||
3 ივნისი (revision 4314533)
|
||||
1933 (revision 4275988)
|
||||
1925 (revision 4054131)
|
||||
Wayback Machine (revision 4393565)
|
||||
ჰიტლერი (revision 4442491)
|
||||
რეპი (revision 4164120)
|
||||
რელიგია (revision 4445299)
|
||||
ჟერარ დე ნერვალი (revision 4430603)
|
||||
ფრანგული ენა (revision 4329255)
|
||||
პაპის სახელმწიფო საქმეების ვატიკანის კომისია (revision 4256091)
|
||||
აშშ-ის ვიცე-პრეზიდენტი (revision 4337709)
|
||||
ვიკისაწყობი (revision 4350741)
|
||||
მიხეილ ლერმონტოვი (revision 4098634)
|
||||
სეკულარიზმი (revision 4387207)
|
||||
რუჯერო ლეონკავალო (revision 4193014)
|
||||
საბჭოთა კავშირი (revision 4378005)
|
||||
რუდოლფ ჰესი (revision 4345664)
|
||||
კომპოზიტორები (revision 4314363)
|
||||
სს (revision 4167932)
|
||||
რუსეთი (revision 4452792)
|
||||
მუსიკა (revision 4135531)
|
||||
იუსტინიანე II (ბიზანტია) (revision 3597768)
|
||||
ანგლიკანიზმი (revision 4401729)
|
||||
ვიოლინო (revision 4146868)
|
||||
პარსიფალი (revision 4435526)
|
||||
291 (revision 3826103)
|
||||
ქრისტიანობა (revision 4446624)
|
||||
ლიბერალიზმი (revision 4383197)
|
||||
22 მარტი (revision 3887060)
|
||||
27 თებერვალი (revision 4313421)
|
||||
ბლუზი (revision 4109362)
|
||||
ლიბანი (revision 4331748)
|
||||
205 (revision 3050594)
|
||||
რომის კათოლიკური ეკლესია (revision 4175965)
|
||||
ეფესოს საეკლესიო კრება (revision 4042351)
|
||||
რიხარდ ფონ ვაიცზეკერი (revision 4331853)
|
||||
325 (revision 3827050)
|
||||
ტრევორ ჰოვარდი (revision 3560113)
|
||||
1895 (revision 4276008)
|
||||
პალესტინის სახელმწიფო (revision 4357268)
|
||||
პილიგრიმი (revision 4352139)
|
||||
გერმანული ენა (revision 4418082)
|
||||
ბჰუტანი (revision 4380636)
|
||||
კინიკოსები (revision 2833240)
|
||||
ევაგრიოს სქოლასტიკოსი (revision 3929102)
|
||||
ჯონ კიტსი (revision 4107589)
|
||||
ქალი (revision 4372485)
|
||||
სიქსტეს კაპელა (revision 4389181)
|
||||
მიხეილ VIII (ბიზანტია) (revision 4021585)
|
||||
2 იანვარი (revision 4451458)
|
||||
201 (revision 3824700)
|
||||
ვიკიციტატა (revision 4393663)
|
||||
სირია (revision 4331625)
|
||||
კონსტანტინე IV (ბიზანტია) (revision 4302882)
|
||||
ჯვაროსნული ლაშქრობები (revision 4395332)
|
||||
ბგერა (revision 4436502)
|
||||
DMOZ (revision 4386077)
|
||||
ივნისი (revision 3753237)
|
||||
ღირებულება (revision 4250301)
|
||||
კატეგორია (ფილოსოფია) (revision 2381896)
|
||||
ავიცენა (revision 4327548)
|
||||
MusicBrainz (revision 4411515)
|
||||
იდეალიზმი (revision 4245343)
|
||||
210 (revision 3050588)
|
||||
ბიზანტია (revision 4440485)
|
||||
258 (revision 3825790)
|
||||
20 მაისი (revision 4434926)
|
||||
ძვ. წ. 44 (revision 2356607)
|
||||
ალექსანდრია (revision 4427155)
|
||||
ინდუიზმი (revision 4448864)
|
||||
კოლორადოს შტატი (revision 3351421)
|
||||
კონსტანტინოპოლის მეორე საეკლესიო კრება (revision 4374923)
|
||||
მარტინ ბუბერი (revision 4440267)
|
||||
მიუზიკლი (revision 4356140)
|
||||
დიდი ბრიტანეთი (revision 4438930)
|
||||
ფელიქს მენდელსონი (revision 4108745)
|
||||
მოსახლეობა (revision 2789480)
|
||||
ISSN (revision 3500238)
|
||||
ქვეყნების სია (revision 4448427)
|
||||
ებრაული ენა (revision 4210619)
|
||||
ბელიზი (revision 4430794)
|
||||
რენი ჰარლინი (revision 3743470)
|
||||
1952 (revision 4278487)
|
||||
ძველი საბერძნეთი (revision 4446035)
|
||||
ფილოსოფოსი (revision 4288867)
|
||||
თორმეტი ტაბულის კანონები (revision 4310428)
|
||||
ისააკ I კომნენოსი (revision 4016717)
|
||||
სუბიექტი (revision 4137093)
|
||||
მესამე რაიხი (revision 4431825)
|
||||
281 (revision 3050510)
|
||||
დასავლეთ რომის იმპერია (revision 4418326)
|
||||
კლასიკური მუსიკა (revision 4448910)
|
||||
კავთისხევი (revision 4353780)
|
||||
2007 (revision 4441027)
|
||||
ნეოპლატონიზმი (revision 4336053)
|
||||
236 (revision 3825656)
|
||||
ფუგა (revision 3218315)
|
||||
პალესტინა (revision 4240018)
|
||||
მეორე მსოფლიო ომი (revision 4442511)
|
||||
დავიდ ბენ-გურიონი (revision 4428059)
|
||||
1948 (revision 4278428)
|
||||
ზაქარია რიტორი (revision 4021268)
|
||||
პერლისი (revision 4308212)
|
||||
211 (revision 3824746)
|
||||
ადამ მიცკევიჩი (revision 4261723)
|
||||
პეტრე ჩაიკოვსკი (revision 4441450)
|
||||
ქორონიკონი (revision 4451019)
|
||||
ებრ. (revision 4210619)
|
||||
ისლამი (revision 4302636)
|
||||
260 (revision 3991034)
|
||||
VII საუკუნე (revision 3938533)
|
||||
იოანე ანტიოქიელი (ისტორიკოსი) (revision 3657193)
|
||||
სახელმწიფო რელიგია (revision 4440560)
|
||||
არიანელობა (revision 4081875)
|
||||
ოტო მაისნერი (revision 3459961)
|
||||
|
||||
== End of Parsed pages ==
|
||||
|
||||
- Wikipedia parsing ended at: 2022-12-20 12:56:27.858987
|
||||
|
||||
77 characters appeared 1115054 times.
|
||||
|
||||
Most Frequent characters:
|
||||
[ 0] Char ა: 12.857762942422518 %
|
||||
[ 1] Char ი: 12.247478597449092 %
|
||||
[ 2] Char ე: 8.665768653356698 %
|
||||
[ 3] Char ს: 6.575466300286801 %
|
||||
[ 4] Char რ: 6.0818579189886774 %
|
||||
[ 5] Char ო: 5.1301551315003575 %
|
||||
[ 6] Char მ: 4.846133012392225 %
|
||||
[ 7] Char ლ: 4.529556416101821 %
|
||||
[ 8] Char ნ: 4.125181381350141 %
|
||||
[ 9] Char დ: 3.9241148859158392 %
|
||||
[10] Char ბ: 3.471311703289706 %
|
||||
[11] Char ვ: 2.726504725331688 %
|
||||
[12] Char უ: 2.657001364956316 %
|
||||
[13] Char თ: 2.120973513390383 %
|
||||
[14] Char გ: 1.9234046064136805 %
|
||||
[15] Char ტ: 1.9089658438066675 %
|
||||
[16] Char კ: 1.5684442188450065 %
|
||||
[17] Char შ: 1.4508714376164742 %
|
||||
[18] Char ხ: 1.2111521056379333 %
|
||||
[19] Char ც: 1.1790460372322775 %
|
||||
[20] Char პ: 0.9052476382309737 %
|
||||
[21] Char წ: 0.8995976876456208 %
|
||||
[22] Char ზ: 0.8015755290775155 %
|
||||
[23] Char ქ: 0.7613981026927844 %
|
||||
[24] Char ფ: 0.731354714659559 %
|
||||
[25] Char ყ: 0.57871636709971 %
|
||||
[26] Char i: 0.47576171198883643 %
|
||||
[27] Char e: 0.46858717156299157 %
|
||||
[28] Char ღ: 0.41737888927352396 %
|
||||
[29] Char a: 0.3600722476220882 %
|
||||
[30] Char ძ: 0.3447366674618449 %
|
||||
[31] Char n: 0.334512947355016 %
|
||||
[32] Char o: 0.3023171971940372 %
|
||||
[33] Char s: 0.2952323385235155 %
|
||||
[34] Char r: 0.2890442974062243 %
|
||||
[35] Char t: 0.27639916990567276 %
|
||||
[36] Char ჩ: 0.2525438229897386 %
|
||||
[37] Char ჰ: 0.21810602894568334 %
|
||||
[38] Char l: 0.21039339798790013 %
|
||||
[39] Char ჯ: 0.1890491402210117 %
|
||||
[40] Char h: 0.18286109910372056 %
|
||||
[41] Char c: 0.17218897022027632 %
|
||||
[42] Char d: 0.16752551894347717 %
|
||||
[43] Char u: 0.13550913229314454 %
|
||||
[44] Char m: 0.12743777431406908 %
|
||||
[45] Char b: 0.10340306388748885 %
|
||||
[46] Char p: 0.10017452069585868 %
|
||||
[47] Char g: 0.09282061675936772 %
|
||||
[48] Char ჭ: 0.09048889112096814 %
|
||||
[49] Char y: 0.08752939319530713 %
|
||||
[50] Char v: 0.07524299271604784 %
|
||||
[51] Char f: 0.06887558808811053 %
|
||||
[52] Char w: 0.0669025894710032 %
|
||||
[53] Char x: 0.056051097076912866 %
|
||||
[54] Char k: 0.05273287212995962 %
|
||||
[55] Char ჟ: 0.04735196681057599 %
|
||||
|
||||
The first 56 characters have an accumulated ratio of 0.9994027195095485.
|
||||
The first 4 characters have an accumulated ratio of 0.4034647649351511.
|
||||
All characters whose order is over 33 have an accumulated ratio of 0.03062631944282519.
|
||||
|
||||
1485 sequences found.
|
||||
|
||||
First 819 (typical positive ratio): 0.9950126614517769
|
||||
Next 240 (1059-819): 0.003988409500368384
|
||||
Rest: 0.000998929047854702
|
||||
|
||||
- Processing end: 2022-12-20 12:56:28.396075
|
||||
255
script/BuildLangModelLogs/LangGermanModel.log
Normal file
255
script/BuildLangModelLogs/LangGermanModel.log
Normal file
@ -0,0 +1,255 @@
|
||||
= Logs of language model for German (de) =
|
||||
|
||||
- Generated by BuildLangModel.py
|
||||
- Started: 2022-12-14 23:56:29.651754
|
||||
- Maximum depth: 4
|
||||
- Max number of pages: 200
|
||||
|
||||
== Parsed pages ==
|
||||
|
||||
Deutschland (revision 228606777)
|
||||
Ungarneinfälle (revision 228768784)
|
||||
Äthiopien (revision 228777266)
|
||||
Italienische Sprache (revision 228106913)
|
||||
Stadtstaat (revision 228606500)
|
||||
Mais (revision 228729828)
|
||||
Peter Walther (revision 214205285)
|
||||
Elektrizitätsversorgung (revision 227608161)
|
||||
Bundesagentur für Arbeit (revision 228474893)
|
||||
Anton von Werner (revision 228255869)
|
||||
Richard Dedekind (revision 227920744)
|
||||
Eisschild (revision 227400370)
|
||||
Jemen (revision 228753034)
|
||||
Berufsbild (revision 222871013)
|
||||
Leonhard Euler (revision 228416765)
|
||||
Gelände (revision 228649514)
|
||||
Michael Hamburger (revision 226393373)
|
||||
Provinz Schleswig-Holstein (revision 223299120)
|
||||
Primärenergie (revision 228433024)
|
||||
Organisation für eine solidarische Welt (revision 208279359)
|
||||
Gefrierpunktserniedrigung (revision 224283903)
|
||||
Wolf Biermann (revision 228717194)
|
||||
Gemeinsame Normdatei (revision 228265239)
|
||||
Allgemeine Erklärung der Menschenrechte (revision 228825596)
|
||||
Bundesagentur (revision 220221314)
|
||||
Früchte (revision 225710699)
|
||||
Kuwaitische Streitkräfte (revision 225186439)
|
||||
Afrika (revision 228805304)
|
||||
Mesopotamien (revision 226452374)
|
||||
Deutsch-Französischer Krieg (revision 228373530)
|
||||
James E. Hansen (revision 228130584)
|
||||
Nunatak (revision 226004806)
|
||||
1965 (revision 228809670)
|
||||
Emilianische Sprache (revision 228018609)
|
||||
Internet Archive (revision 228865464)
|
||||
Bangladesch (revision 228807193)
|
||||
Sklavenhandel (revision 228087869)
|
||||
Braunschweig (revision 228220012)
|
||||
Ostfrankenreich (revision 228685819)
|
||||
Medina von Zabid (revision 224779101)
|
||||
Anschlag auf die Nord-Stream-Pipelines (revision 228851766)
|
||||
Bernd Rill (revision 215827055)
|
||||
Kubikkilometer (revision 224503101)
|
||||
Martina Kaller-Dietrich (revision 208159206)
|
||||
Normaldruck (revision 220716910)
|
||||
Deutsche 3. Armee (Deutsch-Französischer Krieg) (revision 228764651)
|
||||
Endemit (revision 225834805)
|
||||
Ferdinand Opll (revision 227609827)
|
||||
Süditalienisch (revision 183453109)
|
||||
Angola (revision 228787863)
|
||||
Brasilien (revision 228663551)
|
||||
Burgenordnung (revision 228769701)
|
||||
Lehrstuhl (revision 227634509)
|
||||
Brandenburg (revision 228781446)
|
||||
Kayser & von Großheim (revision 220121048)
|
||||
Leistungsmissbrauch (revision 195597228)
|
||||
Jean Cavaillès (revision 202181070)
|
||||
Kangerlussuaq (revision 224574132)
|
||||
Rimini (revision 227046826)
|
||||
Jimma (revision 206733769)
|
||||
Académie des sciences (revision 227860750)
|
||||
Ibrahim al-Hamdi (revision 211147927)
|
||||
Nilotische Sprachen (revision 198831990)
|
||||
Abdallah ibn Husain al-Ahmar (revision 211150675)
|
||||
Modulares Gesetz von Dedekind (revision 212467071)
|
||||
Frank-Jürgen Weise (revision 228738039)
|
||||
Algebraischer Zahlkörper (revision 207023222)
|
||||
Zeitzone (revision 228207191)
|
||||
Rio de Janeiro (revision 228611675)
|
||||
Schlacht bei Riade (revision 228766987)
|
||||
Fajsz (revision 215292800)
|
||||
Basel (revision 228844409)
|
||||
Römischer Dialekt (revision 224726203)
|
||||
Eritrea (revision 228647055)
|
||||
Schwaben (revision 226441292)
|
||||
27. Dezember (revision 228505321)
|
||||
Kursan (revision 221194924)
|
||||
Gebiet (revision 227478187)
|
||||
Max Liebermann (revision 228825219)
|
||||
Marcel Reich-Ranicki (revision 227818012)
|
||||
6. Oktober (revision 228177719)
|
||||
Vereinigtes Königreich (revision 228830889)
|
||||
Freie Hansestadt Bremen (revision 228393052)
|
||||
Wilhelminer (revision 184639586)
|
||||
Sorghumhirsen (revision 227421041)
|
||||
Waadt (revision 228637190)
|
||||
Spanien (revision 228606765)
|
||||
Nigeria (revision 228860955)
|
||||
Theodor Fontane (revision 228446109)
|
||||
August zu Eulenburg (revision 221742465)
|
||||
Wirtschaftswachstum (revision 228334614)
|
||||
Sendeanlage (revision 223637854)
|
||||
Bergpredigt (revision 228386490)
|
||||
Emmy Noether (revision 228699911)
|
||||
Europa (revision 228865802)
|
||||
Arbeitgeber (revision 203029719)
|
||||
Kraftwerkseinsatzoptimierung (revision 217324853)
|
||||
Alte Nationalgalerie (revision 228284855)
|
||||
Dr. phil. (revision 228312094)
|
||||
Hans Mayer (Literaturwissenschaftler) (revision 226080621)
|
||||
Strukturanpassungsmaßnahme (revision 154677279)
|
||||
Neue Deutsche Biographie (revision 228241293)
|
||||
Kernkraftwerk (revision 228305057)
|
||||
Mechthild Schulze-Dörrlamm (revision 226743261)
|
||||
Günter de Bruyn (revision 228690339)
|
||||
Aostatal (revision 226405922)
|
||||
Quadratkilometer (revision 223099015)
|
||||
Oberflächenwasser (revision 228810839)
|
||||
Süßgräser (revision 225517154)
|
||||
Florenz (revision 228438880)
|
||||
Menelik I. (revision 220657570)
|
||||
American Sign Language (revision 225955248)
|
||||
Handelssprache (revision 223923299)
|
||||
Kyrillisches Alphabet (revision 226270962)
|
||||
Adolf Rosenberg (revision 217941753)
|
||||
Georg Cantor (revision 227767731)
|
||||
Akademie der Künste (Berlin) (revision 228718084)
|
||||
Eiszeitalter (revision 228423862)
|
||||
Hybride (revision 224939585)
|
||||
Ideal (Ringtheorie) (revision 227062102)
|
||||
Malawi (revision 227979262)
|
||||
Realismus (Kunst) (revision 228665978)
|
||||
Elektromagnet (revision 215856084)
|
||||
Bedingungsloses Grundeinkommen (revision 228319519)
|
||||
CryoSat-2 (revision 222991606)
|
||||
Steinkohlenbergbau (revision 227136570)
|
||||
Flughafen Sai'ūn (revision 218085552)
|
||||
Kanada (revision 228843319)
|
||||
Tätigkeitsschlüssel (revision 206985116)
|
||||
Arnulf I. (Bayern) (revision 219794310)
|
||||
Äthiopischer Birr (revision 218043169)
|
||||
Dachsprache (revision 217492262)
|
||||
Regierungsbezirk (revision 225453571)
|
||||
Altgriechische Sprache (revision 226388919)
|
||||
Heiliges Römisches Reich (revision 228549579)
|
||||
Mangan (revision 227356461)
|
||||
Tomate (revision 228788761)
|
||||
Liste der Außenminister Äthiopiens (revision 222986689)
|
||||
David Archer (revision 208114022)
|
||||
Schlacht von Pressburg (revision 218874493)
|
||||
Eswatini (revision 227460982)
|
||||
Explorix (revision 222971601)
|
||||
Verbraucher (revision 224557437)
|
||||
Umspannwerk (revision 222823940)
|
||||
Eugène Delacroix (revision 228537433)
|
||||
Statistische Systematik der Wirtschaftszweige in der Europäischen Gemeinschaft (revision 225614083)
|
||||
Gemeinde (Deutschland) (revision 228003605)
|
||||
Klimaschutz (revision 228365985)
|
||||
Energiewandler (revision 226533635)
|
||||
Günter Eich (revision 227865826)
|
||||
Arik Brauer (revision 228858852)
|
||||
Olympus (Satellit) (revision 222991938)
|
||||
Hochspannungs-Gleichstrom-Übertragung (revision 227996489)
|
||||
AS Roma (revision 227513080)
|
||||
Ali Abdullah Salih (revision 228737840)
|
||||
Pirelli-Hochhaus (revision 225174330)
|
||||
Beschäftigung (revision 220327973)
|
||||
Windenergie (revision 228833811)
|
||||
Carnot-Kreisprozess (revision 224094562)
|
||||
Umweltverschmutzung (revision 223785003)
|
||||
Camille Le Tellier de Louvois (revision 224753378)
|
||||
Salzburggau (revision 225169888)
|
||||
Karl Friedrich Schimper (revision 225862938)
|
||||
Staatsreligion (revision 226365106)
|
||||
Kilogramm (revision 227055964)
|
||||
Kulturhistorisches Museum Magdeburg (revision 228122358)
|
||||
Wasserversorgung (revision 219277006)
|
||||
Energiewirtschaft (revision 224577593)
|
||||
Gebrauchssprache (revision 222726670)
|
||||
Roggenbrot (revision 223590572)
|
||||
Mekka (revision 228406949)
|
||||
Kultusministerium (revision 228012512)
|
||||
Indiana University Bloomington (revision 225883504)
|
||||
Kyros-Zylinder (revision 228414898)
|
||||
Regeln für die alphabetische Katalogisierung (revision 213780375)
|
||||
Thailändische Streitkräfte (revision 220673813)
|
||||
Deutsche Nationalstiftung (revision 219900629)
|
||||
Persistenz (Informatik) (revision 211098342)
|
||||
Altewiek (revision 217193242)
|
||||
Distributivgesetz (revision 227632200)
|
||||
Örterbau (revision 214830939)
|
||||
Webanwendung (revision 228408636)
|
||||
ISO 3166-2 (revision 224408291)
|
||||
Magneteisen (revision 226103474)
|
||||
Wilhelm Hauff (revision 224019493)
|
||||
Reichsfreiheit (revision 223024255)
|
||||
Mödling (revision 228813084)
|
||||
N-tv (revision 228248458)
|
||||
Island (revision 228455598)
|
||||
Deutsches Archäologisches Institut (revision 228802778)
|
||||
Fajsz (Ort) (revision 203896222)
|
||||
Innu (revision 219609498)
|
||||
AEG (revision 228810033)
|
||||
Abgabenordnung (revision 228397042)
|
||||
Gentechnik (revision 228859989)
|
||||
Bajuwaren (revision 224140177)
|
||||
|
||||
== End of Parsed pages ==
|
||||
|
||||
- Wikipedia parsing ended at: 2022-12-14 23:59:59.485394
|
||||
|
||||
61 characters appeared 3973938 times.
|
||||
|
||||
Most Frequent characters:
|
||||
[ 0] Char e: 15.586378046159755 %
|
||||
[ 1] Char n: 9.613788639883158 %
|
||||
[ 2] Char i: 8.167163151513687 %
|
||||
[ 3] Char r: 7.78381544956162 %
|
||||
[ 4] Char s: 6.370834170034863 %
|
||||
[ 5] Char t: 6.198536564989187 %
|
||||
[ 6] Char a: 6.0610407107508975 %
|
||||
[ 7] Char d: 5.036314104548183 %
|
||||
[ 8] Char h: 4.10476962650147 %
|
||||
[ 9] Char u: 3.901092568630915 %
|
||||
[10] Char l: 3.8286707039717283 %
|
||||
[11] Char g: 3.025563056091967 %
|
||||
[12] Char o: 2.9347715037325695 %
|
||||
[13] Char c: 2.801981309220224 %
|
||||
[14] Char m: 2.6123709026159943 %
|
||||
[15] Char b: 2.136419843490261 %
|
||||
[16] Char f: 1.5972065995996918 %
|
||||
[17] Char k: 1.4957958579122272 %
|
||||
[18] Char w: 1.3413646614516885 %
|
||||
[19] Char z: 1.1560572912813436 %
|
||||
[20] Char p: 1.0669013960459373 %
|
||||
[21] Char v: 0.9756568924829728 %
|
||||
[22] Char ä: 0.569409990795025 %
|
||||
[23] Char ü: 0.531311761783903 %
|
||||
[24] Char ö: 0.30601886592090766 %
|
||||
[25] Char j: 0.2797728600697847 %
|
||||
[26] Char y: 0.18633909235624713 %
|
||||
[27] Char ß: 0.15126053803557077 %
|
||||
[28] Char x: 0.07903998502241354 %
|
||||
[29] Char q: 0.03792208132084597 %
|
||||
|
||||
The first 30 characters have an accumulated ratio of 0.9993756822577502.
|
||||
The first 4 characters have an accumulated ratio of 0.41151145287118224.
|
||||
All characters whose order is over 20 have an accumulated ratio of 0.031167320677876705.
|
||||
|
||||
1313 sequences found.
|
||||
|
||||
First 511 (typical positive ratio): 0.9950430213396004
|
||||
Next 186 (697-511): 0.0039634314878256305
|
||||
Rest: 0.0009935471725739387
|
||||
|
||||
- Processing end: 2022-12-14 23:59:59.567695
|
||||
231
script/BuildLangModelLogs/LangGreekModel.log
Normal file
231
script/BuildLangModelLogs/LangGreekModel.log
Normal file
@ -0,0 +1,231 @@
|
||||
= Logs of language model for Greek (el) =
|
||||
|
||||
- Generated by BuildLangModel.py
|
||||
- Started: 2022-12-18 20:25:01.002309
|
||||
- Maximum depth: 4
|
||||
- Max number of pages: 200
|
||||
|
||||
== Parsed pages ==
|
||||
|
||||
Πρωτεύοντα (revision 9792164)
|
||||
Ευαρχοντομυωξοί (revision 9475530)
|
||||
Φολιδωτά (θηλαστικά) (revision 8966182)
|
||||
Ανθρώπινη εξέλιξη (revision 9731824)
|
||||
Υδατάνθρακες (revision 9276169)
|
||||
Άνθρωπος (revision 9804050)
|
||||
National Library of the Czech Republic (revision 9499518)
|
||||
Ταξινομία (revision 6174527)
|
||||
Δεοξυριβόζη (revision 9735675)
|
||||
Συστηματική ταξινόμηση (revision 9163863)
|
||||
Οικογένεια (βιολογία) (revision 8380547)
|
||||
Μονοσακχαρίτης (revision 8520367)
|
||||
Ευλιπότυφλα (revision 8635098)
|
||||
Γαλάγος (revision 9624211)
|
||||
Ανθρωποειδή (revision 9802784)
|
||||
Μυρμήγκι (revision 9743672)
|
||||
Primates (revision 9792164)
|
||||
Εθνική Βιβλιοθήκη της Μποτσουάνα (revision 9771961)
|
||||
Εθνική Βιβλιοθήκη της Σλοβακίας (revision 9545464)
|
||||
Κίνα (revision 9794230)
|
||||
Μονοσακχαρίτες (revision 8520367)
|
||||
Άνθρακας (revision 9698608)
|
||||
Τερμίτης (revision 8570600)
|
||||
Virtual International Authority File (revision 9547787)
|
||||
Διεθνής πρότυπος αριθμός βιβλίου (revision 9525547)
|
||||
International Union for Conservation of Nature (revision 9555075)
|
||||
Neogene (revision 7970278)
|
||||
Ανθρωπoειδή (revision 9802784)
|
||||
Λάρυγγας (revision 8037233)
|
||||
Θηλαστικά (revision 9802762)
|
||||
IUCN Red List (revision 9104016)
|
||||
Δισακχαρίτης (revision 9301054)
|
||||
Ινσουλίνη (revision 9193560)
|
||||
Αρχαϊκοί Homo sapiens (revision 9496339)
|
||||
Εθνική Βιβλιοθήκη της Μοζαμβίκης (revision 9771960)
|
||||
Εθνική Βιβλιοθήκη της Πολωνίας (revision 9771967)
|
||||
Ολιγοσακχαρίτης (revision 9784937)
|
||||
Θεσμός (revision 9409922)
|
||||
Μοριακό βάρος (revision 8588261)
|
||||
Παράνθρωποι (revision 9187211)
|
||||
Χρονολόγιο της ανθρώπινης εξέλιξης (revision 9494488)
|
||||
Κοινός πρόγονος (revision 7955205)
|
||||
Ασία (revision 9640488)
|
||||
Εθνική Βιβλιοθήκη του Βανουάτου (revision 9510031)
|
||||
Συνομοταξία (revision 8090691)
|
||||
Διαδίκτυο (revision 9818610)
|
||||
Τριγλυκερίδιο (revision 8991916)
|
||||
Εθνική Βιβλιοθήκη της Λετονίας (revision 9736743)
|
||||
Εθνική βιβλιοθήκη της Σουηδίας (revision 9741133)
|
||||
Ζώα (revision 9797988)
|
||||
Απειλούμενα είδη (revision 9387012)
|
||||
Εθνική Βιβλιοθήκη της Μιανμάρ (revision 9771959)
|
||||
Silurian (revision 7083264)
|
||||
Γερμανική γλώσσα (revision 9768836)
|
||||
Ζωολογία (revision 9597532)
|
||||
Σπονδυλωτά (revision 8936763)
|
||||
Χορδωτά (revision 9800855)
|
||||
Εθνική Βιβλιοθήκη του Ελ Σαλβαδόρ (revision 9608126)
|
||||
Μακρομόρια (revision 8962637)
|
||||
Homo sapiens (revision 9804050)
|
||||
Γλυκίδια (revision 8976376)
|
||||
Κατάλογος καθιερωμένων όρων (revision 9747802)
|
||||
Paleogene (revision 7772183)
|
||||
Γένος (βιολογία) (revision 8620951)
|
||||
Βραδυποδόμορφα (revision 8793874)
|
||||
Εθνική Βιβλιοθήκη της Ουκρανίας (revision 9818749)
|
||||
Περίοδος (γεωλογία) (revision 9598229)
|
||||
Γραμμομόριο (revision 9175982)
|
||||
Νουκλεϊκά οξέα (revision 9020237)
|
||||
Γάλα (revision 9473543)
|
||||
Μετάλλαξη (revision 9662655)
|
||||
Γαλακτόζη (revision 8983758)
|
||||
Φάλαινα (revision 9455804)
|
||||
Εθνική Βιβλιοθήκη της Ισπανίας (revision 9771953)
|
||||
Γλυκογόνο (revision 8033277)
|
||||
Ισπανική γλώσσα (revision 9751022)
|
||||
Φωνητικές χορδές (revision 9179304)
|
||||
Κανονικές συνθήκες (revision 9776846)
|
||||
Άλπεις (revision 9759633)
|
||||
Αντίδραση συμπύκνωσης (revision 8965637)
|
||||
National Diet Library (revision 9533181)
|
||||
Εθνική Βιβλιοθήκη της Βραζιλίας (revision 9516238)
|
||||
Homo sapiens sapiens (revision 9804050)
|
||||
Εθνική Βιβλιοθήκη των Μπαρμπάντος (revision 9608141)
|
||||
Μουντάνεουμ (revision 9387431)
|
||||
Ζώο (revision 9797988)
|
||||
Εθνική και Πανεπιστημιακή Βιβλιοθήκη της Ισλανδίας (revision 9510045)
|
||||
Μόριο (revision 9737689)
|
||||
Εθνική Βιβλιοθήκη της Ανδόρας (revision 9771949)
|
||||
Βασίλειο (βιολογία) (revision 9171746)
|
||||
Εθνική και Πανεπιστημιακή Βιβλιοθήκη «Άγιος Κλήμης της Αχρίδας» (revision 9608210)
|
||||
Κλίμα (revision 9262599)
|
||||
Δακτυλιοσκώληκες (revision 8985128)
|
||||
Ασπάλακας (revision 9429446)
|
||||
Μόλυνση (revision 8512424)
|
||||
International Standard Name Identifier (revision 6861942)
|
||||
Υδροξυλομάδα (revision 9719647)
|
||||
Εθνική Βιβλιοθήκη του Κουβέιτ (revision 9511761)
|
||||
Homo rhodesiensis (revision 7605622)
|
||||
Αγγλική γλώσσα (revision 9779698)
|
||||
Περιβαλλοντική εκπαίδευση (revision 7971138)
|
||||
Γουανίνη (revision 8392293)
|
||||
Γριβάδι (revision 9370003)
|
||||
Διεθνής Επιτροπή Στρωματογραφίας (revision 9796210)
|
||||
Εχινόδερμα (revision 9101031)
|
||||
Εθνική Βιβλιοθήκη των Φιλιππινών (revision 9511751)
|
||||
Αρτίγονος (revision 9753577)
|
||||
Εθνική Βιβλιοθήκη της Σρι Λάνκα (revision 9511705)
|
||||
Περιβαλλοντικά προβλήματα (revision 9555971)
|
||||
Υπερτάξη (revision 7554395)
|
||||
Κατάλογος αντιστοιχίας Λατινικών-Ελληνικών όρων ταξινομικών μονάδων (revision 9562399)
|
||||
Κόκκινος κατάλογος της IUCN (revision 9104016)
|
||||
Κοινοβουλευτική Βιβλιοθήκη της Γεωργίας (revision 9508234)
|
||||
Ασπόνδυλα (revision 9049085)
|
||||
Τάξη (βιολογία) (revision 7554395)
|
||||
Γρυλοβλαττοειδή (revision 6401187)
|
||||
Γλυκόζη (revision 9770284)
|
||||
Τουρκικές γλώσσες (revision 9284882)
|
||||
Εκπνοή (revision 9611418)
|
||||
Ανθρωπίνοι (revision 9103976)
|
||||
Εθνική Βιβλιοθήκη του Μαυρικίου (revision 9736776)
|
||||
Σαρκοφάγα (revision 8222140)
|
||||
Χημική ένωση (revision 9478321)
|
||||
Νουκλεοτίδια (revision 8520133)
|
||||
Πλειστόκαινο (revision 9225169)
|
||||
Υποοικογένεια (revision 8380547)
|
||||
Πόδι (έντομα) (revision 7865328)
|
||||
Δημόσια Βιβλιοθήκη Τσαρλς Α. Χάλμπερτ (revision 9607718)
|
||||
Δισακχαρίτες (revision 9301054)
|
||||
Νορβηγική γλώσσα (revision 9527903)
|
||||
Σορβόζη (revision 9702780)
|
||||
Bibliothèque nationale de France (revision 9636186)
|
||||
1778 (revision 9509259)
|
||||
Αμυλοπηκτίνη (revision 7348804)
|
||||
Υφομοταξία (revision 9796614)
|
||||
Κοινή καταγωγή (revision 7955205)
|
||||
Βιβλιοθήκη του Βατικανού (revision 9791596)
|
||||
Κράμα (revision 8491814)
|
||||
Orrorin tugenensis (revision 8021796)
|
||||
Εθνική Βιβλιοθήκη της Γερμανίας (revision 9533197)
|
||||
Εθνική Βιβλιοθήκη της Ελλάδος (revision 9771951)
|
||||
Κάρολος Λινναίος (revision 9170651)
|
||||
Εθνική Βιβλιοθήκη της Μαυριτανίας (revision 9771958)
|
||||
Εθνική Βιβλιοθήκη της Σαουδικής Αραβίας (revision 9777111)
|
||||
Εθνική Βιβλιοθήκη της Ιορδανίας (revision 9510012)
|
||||
Κλαδιστική (revision 7593647)
|
||||
Κετόζες (revision 9015709)
|
||||
Υδροξύλιο (revision 9719647)
|
||||
Απειλούμενο είδος (revision 9387012)
|
||||
Νέφος (revision 9753949)
|
||||
Κατάρρινοι (revision 9802799)
|
||||
Επικοινωνία (revision 9810024)
|
||||
Χημικός τύπος (revision 9478340)
|
||||
Εθνικά Αρχεία και Βιβλιοθήκη της Αιθιοπίας (revision 9608078)
|
||||
Ολιγόκαινος εποχή (revision 8882927)
|
||||
|
||||
== End of Parsed pages ==
|
||||
|
||||
- Wikipedia parsing ended at: 2022-12-18 20:30:49.244663
|
||||
|
||||
62 characters appeared 918903 times.
|
||||
|
||||
Most Frequent characters:
|
||||
[ 0] Char α: 9.042630179681641 %
|
||||
[ 1] Char ο: 7.761537398397872 %
|
||||
[ 2] Char τ: 7.389680956531865 %
|
||||
[ 3] Char ι: 7.071584269503963 %
|
||||
[ 4] Char ν: 6.1224090029089036 %
|
||||
[ 5] Char ε: 5.937188147171138 %
|
||||
[ 6] Char κ: 4.257359046602308 %
|
||||
[ 7] Char ρ: 4.217311294010358 %
|
||||
[ 8] Char σ: 4.050373107934135 %
|
||||
[ 9] Char η: 3.6424954538183028 %
|
||||
[10] Char π: 3.53171118170253 %
|
||||
[11] Char ς: 3.3343018795237365 %
|
||||
[12] Char μ: 3.2733596473185957 %
|
||||
[13] Char υ: 3.02023173283796 %
|
||||
[14] Char λ: 2.6589313561932 %
|
||||
[15] Char ί: 2.381426548830508 %
|
||||
[16] Char ό: 1.9545044471505697 %
|
||||
[17] Char ά: 1.8594998601593422 %
|
||||
[18] Char γ: 1.7558980654106038 %
|
||||
[19] Char δ: 1.6237840120230318 %
|
||||
[20] Char έ: 1.569806606355622 %
|
||||
[21] Char ω: 1.5474973963519545 %
|
||||
[22] Char ή: 1.323969994656672 %
|
||||
[23] Char χ: 1.1194870405254962 %
|
||||
[24] Char ύ: 1.0730185884690766 %
|
||||
[25] Char θ: 1.0217618181679675 %
|
||||
[26] Char ώ: 0.7902901612030867 %
|
||||
[27] Char φ: 0.7704839357364162 %
|
||||
[28] Char β: 0.7675456495408112 %
|
||||
[29] Char ξ: 0.4437900409510035 %
|
||||
[30] Char ζ: 0.4305133403634551 %
|
||||
[31] Char a: 0.4036334629444022 %
|
||||
[32] Char e: 0.39601568391875963 %
|
||||
[33] Char i: 0.3618445037180203 %
|
||||
[34] Char n: 0.3161378295641651 %
|
||||
[35] Char o: 0.31026125717295516 %
|
||||
[36] Char s: 0.2842519830711185 %
|
||||
[37] Char r: 0.2519308349194638 %
|
||||
[38] Char t: 0.23560702272165832 %
|
||||
[39] Char l: 0.20110936627696285 %
|
||||
[40] Char c: 0.19925933422787825 %
|
||||
[41] Char h: 0.1609527882703615 %
|
||||
[42] Char d: 0.14419367441394795 %
|
||||
[43] Char u: 0.13690237163226152 %
|
||||
[44] Char m: 0.1365758953883054 %
|
||||
[45] Char p: 0.11328725665276966 %
|
||||
[46] Char ψ: 0.10240471518756604 %
|
||||
|
||||
The first 47 characters have an accumulated ratio of 0.9949875014011275.
|
||||
The first 6 characters have an accumulated ratio of 0.4332502995419538.
|
||||
All characters whose order is over 31 have an accumulated ratio of 0.03350734517136193.
|
||||
|
||||
1389 sequences found.
|
||||
|
||||
First 849 (typical positive ratio): 0.9950207709120384
|
||||
Next 223 (1072-849): 0.003984435961508326
|
||||
Rest: 0.0009947931264532306
|
||||
|
||||
- Processing end: 2022-12-18 20:30:49.348223
|
||||
285
script/BuildLangModelLogs/LangHebrewModel.log
Normal file
285
script/BuildLangModelLogs/LangHebrewModel.log
Normal file
@ -0,0 +1,285 @@
|
||||
= Logs of language model for Hebrew (he) =
|
||||
|
||||
- Generated by BuildLangModel.py
|
||||
- Started: 2022-12-15 15:23:40.722736
|
||||
- Maximum depth: 4
|
||||
- Max number of pages: 200
|
||||
|
||||
== Parsed pages ==
|
||||
|
||||
יהדות_בוקרשט (revision 35182799)
|
||||
בית כלא (revision 35227881)
|
||||
יעקב אלמולי (revision 35001208)
|
||||
טודור ולדימירסקו (revision 29886791)
|
||||
בלקן (revision 33993972)
|
||||
גזירות ת"ח ת"ט (revision 34763839)
|
||||
סילביו ברוקאן (revision 29510407)
|
||||
1912 (revision 33159662)
|
||||
צבא (revision 34619941)
|
||||
שיטת ספריית הקונגרס (revision 30163525)
|
||||
מרסל ינקו (revision 34359400)
|
||||
פנקס הקהילות (revision 34615712)
|
||||
יעקב פסנתיר (revision 33120540)
|
||||
בוקרשט (revision 35173617)
|
||||
יהדות לוב (revision 34775645)
|
||||
מדרג (revision 34420008)
|
||||
קובץ בקרה משולב (revision 34980719)
|
||||
צבא קבע (revision 34044374)
|
||||
עברית (revision 35171043)
|
||||
2003 (revision 34884225)
|
||||
תומאס ג'פרסון (revision 35144907)
|
||||
אפגניסטן (revision 35015482)
|
||||
טנק (revision 34805353)
|
||||
21 באפריל (revision 34869840)
|
||||
רומנית (revision 35154129)
|
||||
מערב אירופה (revision 35029137)
|
||||
פינסק (revision 34882043)
|
||||
בית סוהר גבעון (revision 34787725)
|
||||
הקהילה היהודית הספרדית בבוקרשט (revision 32942838)
|
||||
קומוניזם (revision 34968105)
|
||||
אנרגיה (revision 35140939)
|
||||
ספריית הוותיקן (revision 32639141)
|
||||
לאו רומני (revision 34364476)
|
||||
תותח (revision 35035899)
|
||||
כ' בסיוון (revision 34741740)
|
||||
Union List of Artist Names (revision 34992334)
|
||||
בנימין גלאי (revision 33202928)
|
||||
חיל הנדסה (revision 33949573)
|
||||
25 באוגוסט (revision 34821302)
|
||||
אנה טיכו (revision 34831809)
|
||||
חייל (revision 35206828)
|
||||
הלאמה (revision 34453859)
|
||||
קלוויניסט (revision 34763753)
|
||||
רפואה (revision 35157098)
|
||||
תלמוד תורה (revision 35216490)
|
||||
23 בינואר (revision 35038971)
|
||||
מלחמת העולם הראשונה (revision 35191080)
|
||||
כוחות מילואים (revision 32714547)
|
||||
גרמנית (revision 35085309)
|
||||
אוטודידקט (revision 34614272)
|
||||
דיוויזיית מתנדבים 1, טודור ולדימירסקו (revision 28599203)
|
||||
זית (revision 35159584)
|
||||
יהדות רומניה (revision 34919407)
|
||||
צבי לוקר (revision 34639828)
|
||||
WorldCat (revision 34980710)
|
||||
ספרי יזכור (revision 34570622)
|
||||
דת (revision 35160267)
|
||||
גזירות תתנ"ו (revision 34939752)
|
||||
פרו (revision 35228614)
|
||||
הארץ (revision 35234211)
|
||||
הספרייה הלאומית (revision 35173909)
|
||||
בויאר (revision 34292683)
|
||||
נפוליאון בונפרטה (revision 35212132)
|
||||
העולם השלישי (revision 34866022)
|
||||
פנדורים (revision 22519224)
|
||||
קושטא (revision 34914083)
|
||||
תענית ציבור (revision 35122402)
|
||||
י"ט באייר (revision 33760934)
|
||||
דוד רובינגר (revision 34618241)
|
||||
1886 (revision 30398678)
|
||||
9 במרץ (revision 35039056)
|
||||
1855 (revision 34224046)
|
||||
מרד הלגיונרים ופרעות בוקרשט (revision 35067329)
|
||||
1966 (revision 34533574)
|
||||
יווני (revision 34012584)
|
||||
אוניברסיטת בוקרשט (revision 35188136)
|
||||
בוסניה והרצגובינה (revision 35162864)
|
||||
נצרות (revision 35210877)
|
||||
כלא שש (revision 35057829)
|
||||
אלפרד מנספלד (revision 35050837)
|
||||
אות (revision 34005221)
|
||||
י"א באייר (revision 34914962)
|
||||
5 באפריל (revision 35157784)
|
||||
ישראל (revision 35213935)
|
||||
קיילצה (revision 33935006)
|
||||
לותרני (revision 35064164)
|
||||
יום ראשון (revision 34281448)
|
||||
יהדות איטליה (revision 35198843)
|
||||
פרס דיזנגוף (revision 34534024)
|
||||
ה' בסיוון (revision 34566809)
|
||||
ח' בטבת (revision 35079706)
|
||||
האימפריה הרומית (revision 35119178)
|
||||
שגריר (revision 34965857)
|
||||
דן מכמן (revision 34522541)
|
||||
הספרייה הלאומית של צרפת (revision 34954915)
|
||||
דן ריזינגר (revision 34757254)
|
||||
אסטרטגיה צבאית (revision 35069854)
|
||||
אביבה ברושי (revision 35050673)
|
||||
טורקית (revision 34730801)
|
||||
11 במאי (revision 34445764)
|
||||
רב (revision 35062888)
|
||||
וולוז'ין (revision 35024306)
|
||||
ולאכיה (revision 33077945)
|
||||
יהדות הולנד (revision 33771623)
|
||||
אנגלית (revision 35222539)
|
||||
אוסטרליה (revision 35084368)
|
||||
חוק (revision 35117792)
|
||||
נצרות אורתודוקסית (revision 35181856)
|
||||
שבתאות (revision 35118251)
|
||||
הספרייה הלאומית של צ'כיה (revision 34679038)
|
||||
שימוש הוגן (revision 34698539)
|
||||
המאה ה-19 (revision 35228599)
|
||||
אולטניה (revision 35181527)
|
||||
תולדות עם ישראל (revision 35227911)
|
||||
1999 (revision 34550725)
|
||||
טוגאי ביי (revision 29009639)
|
||||
בית הדין העממי (רומניה) (revision 29292417)
|
||||
יהדות (revision 35238551)
|
||||
מוסלמים (revision 35186931)
|
||||
סלובניה (revision 34076843)
|
||||
1944 (revision 33848050)
|
||||
VIAF (revision 34992335)
|
||||
יחיאל שמי (revision 35169033)
|
||||
משפחת אוסטרוגסקי (revision 27522789)
|
||||
בוהמיה (revision 34774081)
|
||||
גולאג (revision 33926313)
|
||||
משה מוקדי (revision 33579655)
|
||||
קרן ויקימדיה (revision 35175443)
|
||||
ב' באלול (revision 33761030)
|
||||
רגולציה (revision 35168860)
|
||||
הקהילה היהודית הספרדית ברומניה (revision 32942827)
|
||||
הרתעה (אסטרטגיה) (revision 34184585)
|
||||
נובוגרודק (revision 34333750)
|
||||
מודל צבא העם (revision 34762715)
|
||||
מלחמת העולם השנייה (revision 35218209)
|
||||
חשוון (revision 35214064)
|
||||
1875 (revision 25165857)
|
||||
ליידי בירד ג'ונסון (revision 35156176)
|
||||
הספרייה הלאומית של ספרד (revision 34172052)
|
||||
רבנים (revision 16968274)
|
||||
בית סוהר מגידו (revision 33202574)
|
||||
גליציה (revision 34740074)
|
||||
יהדות בלארוס (revision 34770618)
|
||||
יהודים (revision 35220685)
|
||||
עמירם תמרי (revision 33235872)
|
||||
יהדות ליטא (revision 35062246)
|
||||
עלייה לרגל (revision 34764674)
|
||||
המועצה לישראל יפה (revision 34627430)
|
||||
יום שישי (revision 34737763)
|
||||
ג'מייקה (revision 35022818)
|
||||
למ"ד (revision 34438979)
|
||||
שיעה (revision 35141725)
|
||||
1987 (revision 32747521)
|
||||
שיטפון (revision 34831666)
|
||||
פרסית (revision 35135705)
|
||||
קניין רוחני (revision 34598306)
|
||||
תסריטאי (revision 34389192)
|
||||
גשם (revision 35214991)
|
||||
קצין (revision 35189304)
|
||||
שמואל וודניצקי (revision 33250304)
|
||||
בית חיים (revision 35213536)
|
||||
אליעזר פאפו (revision 34907056)
|
||||
יצחק דנציגר (revision 35163501)
|
||||
ירמיהו (revision 35170413)
|
||||
אלכסנדר סוורוס (revision 34549496)
|
||||
יוליסס סימפסון גרנט (revision 35099753)
|
||||
אלפבית עברי (revision 35167195)
|
||||
יום השבת (revision 32714481)
|
||||
ספרד (revision 35240234)
|
||||
קרואטיה (revision 35208639)
|
||||
יום כיפור קטן (revision 34566029)
|
||||
דתיים לאומיים (revision 35191810)
|
||||
לוניניץ (revision 34618951)
|
||||
מנצ'וריה (revision 35213350)
|
||||
ולנטיניאנוס הראשון (revision 35183518)
|
||||
מערכת התיעוד האוניברסיטאית (צרפת) (revision 34033122)
|
||||
מוזיאון תל אביב (revision 34779076)
|
||||
חוזה פריז (1783) (revision 34280442)
|
||||
דיקטטורה (revision 34987941)
|
||||
+ (revision 34951817)
|
||||
יוני (revision 33963139)
|
||||
כ"ג בסיוון (revision 34929216)
|
||||
דרג דיפלומטי (revision 33574252)
|
||||
אנציקלופדיה בריטניקה (revision 35145787)
|
||||
וגטיוס (revision 33391266)
|
||||
מהרי"ל (revision 34613180)
|
||||
מוזיאון סטדלייק (revision 33770681)
|
||||
ספרייה דיגיטלית (revision 34044215)
|
||||
עיצור שפתי-שיני, אפי (revision 34158419)
|
||||
פסנתרן (revision 34558921)
|
||||
צבא אוסטרליה (revision 34306538)
|
||||
בוואריה (revision 35069866)
|
||||
|
||||
== End of Parsed pages ==
|
||||
|
||||
- Wikipedia parsing ended at: 2022-12-15 15:27:25.018656
|
||||
|
||||
94 characters appeared 1622917 times.
|
||||
|
||||
Most Frequent characters:
|
||||
[ 0] Char י: 11.931047613648756 %
|
||||
[ 1] Char ו: 11.230395639456608 %
|
||||
[ 2] Char ה: 8.702909637399818 %
|
||||
[ 3] Char ר: 6.166673958064399 %
|
||||
[ 4] Char ל: 6.0917471441854385 %
|
||||
[ 5] Char ת: 5.755007803849488 %
|
||||
[ 6] Char ב: 5.319434080732409 %
|
||||
[ 7] Char מ: 5.124476482777616 %
|
||||
[ 8] Char א: 4.594258363181851 %
|
||||
[ 9] Char ש: 4.086160906565154 %
|
||||
[10] Char נ: 3.7032700994567187 %
|
||||
[11] Char ם: 2.782397374603877 %
|
||||
[12] Char ד: 2.615537331853693 %
|
||||
[13] Char ע: 2.5096785602714125 %
|
||||
[14] Char ק: 2.2712806631515967 %
|
||||
[15] Char פ: 2.233139464310251 %
|
||||
[16] Char ח: 2.124939229794253 %
|
||||
[17] Char ס: 2.0344232021723845 %
|
||||
[18] Char כ: 1.8796401787645332 %
|
||||
[19] Char ט: 1.596261546339092 %
|
||||
[20] Char ג: 1.5693347226013405 %
|
||||
[21] Char צ: 1.2939663581070382 %
|
||||
[22] Char ן: 1.2081948737982287 %
|
||||
[23] Char ז: 0.8376275558146227 %
|
||||
[24] Char ך: 0.3550397216863216 %
|
||||
[25] Char ף: 0.2127034222945474 %
|
||||
[26] Char e: 0.16427211003396971 %
|
||||
[27] Char ץ: 0.15817198291717938 %
|
||||
[28] Char a: 0.14005645390368085 %
|
||||
[29] Char i: 0.12958148814757625 %
|
||||
[30] Char n: 0.10296275163794574 %
|
||||
[31] Char r: 0.10246981207295258 %
|
||||
[32] Char t: 0.08983823572000293 %
|
||||
[33] Char o: 0.08287546436447459 %
|
||||
[34] Char s: 0.08238252479948142 %
|
||||
[35] Char l: 0.06894992165341789 %
|
||||
[36] Char u: 0.052744533454267835 %
|
||||
[37] Char c: 0.04947880883618817 %
|
||||
[38] Char d: 0.0451039701968739 %
|
||||
[39] Char h: 0.04196148047004252 %
|
||||
[40] Char m: 0.03327342063703812 %
|
||||
[41] Char g: 0.023414629337174975 %
|
||||
[42] Char p: 0.023291394445926684 %
|
||||
[43] Char y: 0.0219358106421955 %
|
||||
[44] Char b: 0.020025669827847016 %
|
||||
[45] Char C: 0.01990243493659873 %
|
||||
[46] Char A: 0.017930676676626102 %
|
||||
[47] Char B: 0.017437737111632944 %
|
||||
[48] Char I: 0.017437737111632944 %
|
||||
[49] Char k: 0.017437737111632944 %
|
||||
[50] Char v: 0.016390240536022484 %
|
||||
[51] Char f: 0.01632862309039834 %
|
||||
[52] Char S: 0.015958918416653468 %
|
||||
[53] Char M: 0.014418482276049855 %
|
||||
[54] Char D: 0.013432603146063538 %
|
||||
[55] Char T: 0.013186133363566959 %
|
||||
[56] Char L: 0.012754811244197948 %
|
||||
[57] Char P: 0.012508341461701369 %
|
||||
[58] Char R: 0.010906287875473607 %
|
||||
[59] Char E: 0.010598200647352883 %
|
||||
[60] Char z: 0.010536583201728738 %
|
||||
[61] Char w: 0.010474965756104595 %
|
||||
[62] Char N: 0.009304234289245846 %
|
||||
[63] Char G: 0.0086880598330044 %
|
||||
|
||||
The first 64 characters have an accumulated ratio of 0.9992796920606537.
|
||||
The first 5 characters have an accumulated ratio of 0.4412277399275502.
|
||||
All characters whose order is over 22 have an accumulated ratio of 0.031037939709794155.
|
||||
|
||||
1640 sequences found.
|
||||
|
||||
First 688 (typical positive ratio): 0.9950129360753337
|
||||
Next 328 (1016-688): 0.0039909002477918065
|
||||
Rest: 0.0009961636768744953
|
||||
|
||||
- Processing end: 2022-12-15 15:27:25.183725
|
||||
275
script/BuildLangModelLogs/LangHindiModel.log
Normal file
275
script/BuildLangModelLogs/LangHindiModel.log
Normal file
@ -0,0 +1,275 @@
|
||||
= Logs of language model for Hindi (hi) =
|
||||
|
||||
- Generated by BuildLangModel.py
|
||||
- Started: 2022-12-14 20:20:15.059984
|
||||
- Maximum depth: 4
|
||||
- Max number of pages: 200
|
||||
|
||||
== Parsed pages ==
|
||||
|
||||
मुखपृष्ठ (revision 5590190)
|
||||
भारत की स्वतंत्रता (revision 5658303)
|
||||
हिन्दी विकिपीडिया (revision 5671058)
|
||||
सेंट विंसेंट एंड ग्रेनाडाइन्स (revision 5559154)
|
||||
प्रोकैरियोटिक कोशिका (revision 5680074)
|
||||
कतिकी मेला (revision 4822652)
|
||||
कनाडा (revision 5715323)
|
||||
मुखपृष्ठ/अन्य भाषाओं में (revision 4949624)
|
||||
मोन्टाना (revision 5656758)
|
||||
कार्तिक पूर्णिमा (revision 5685731)
|
||||
श्रीविजय एयर उड़ान 182 (revision 5385509)
|
||||
कालीकट अंतर्राष्ट्रीय विमानक्षेत्र (revision 5053097)
|
||||
श्रद्धा कपूर (revision 5715102)
|
||||
द्वीप चाप (revision 5051722)
|
||||
ब्रिटिश कोलम्बिया (revision 5601415)
|
||||
न्यू मेक्सिको (revision 5476551)
|
||||
कॅण्टकी (revision 4910514)
|
||||
हवाई (revision 4924959)
|
||||
इंडोनेशिया (revision 5633388)
|
||||
सुन्दरवन (revision 5637997)
|
||||
आशिकी (revision 5715605)
|
||||
नीपोलिटाई विकिपीडिया (revision 4562757)
|
||||
बुन्देलखण्ड (revision 5714881)
|
||||
साहनिवाल विमानक्षेत्र (revision 4801866)
|
||||
लॉर्ड कॉर्नवालिस (revision 5450877)
|
||||
अगाति विमानक्षेत्र (revision 5377793)
|
||||
न्यूफाउंडलैंड (द्वीप) (revision 5193294)
|
||||
सृष्टि (revision 5709072)
|
||||
बिठूर (revision 5622562)
|
||||
सम्प्रभु राज्य (revision 5315656)
|
||||
वासीनाम (revision 5613691)
|
||||
भारत में समाजवाद (revision 5350981)
|
||||
जनसंख्या घनत्व के अनुसार देशों और अधीन क्षेत्रों की सूची (revision 5598563)
|
||||
भारत के प्रसिद्ध मेले (revision 5696005)
|
||||
लेह कुशोक बकुला रिम्पोची विमानक्षेत्र (revision 5398134)
|
||||
शिंजो आबे (revision 5704870)
|
||||
कार्तिक शुक्ल अष्टमी (revision 5680290)
|
||||
विकिपीडिया (revision 5709682)
|
||||
बेरियम (revision 5569495)
|
||||
योगिनी एकादशी (revision 5398349)
|
||||
बांदा जिला (revision 5381573)
|
||||
जावा सागर (revision 5152752)
|
||||
अलाबामा (revision 5656049)
|
||||
पूर्णिमा (revision 5638603)
|
||||
भारत गणराज्य का इतिहास (revision 5704342)
|
||||
१४ मार्च (revision 4808978)
|
||||
मुम्बई (revision 5668953)
|
||||
गुजरात (revision 5712277)
|
||||
हरतालिका व्रत (revision 5620766)
|
||||
किंग खालिद हवाई अड्डा (revision 3497420)
|
||||
फ़्रान्सीसी विकिपीडिया (revision 5684016)
|
||||
अलकनन्दा नदी (revision 5460363)
|
||||
अस्पताल दुर्घटना, बगदाद (revision 5664521)
|
||||
छत्तीसगढ़ (revision 5712261)
|
||||
अल्बर्टा (revision 5477015)
|
||||
फ़िल्मफ़ेयर पुरस्कार (revision 5658392)
|
||||
औपनिवेशिक भारत (revision 5613904)
|
||||
त्रिशूल पर्वत (revision 5544055)
|
||||
स्त्री (revision 5667699)
|
||||
जकार्ता (revision 5244613)
|
||||
जोस मारती हवाई अड्डा (revision 2467543)
|
||||
मासेचुसेट्स (revision 5596477)
|
||||
सकल घरेलू उत्पाद (revision 5642814)
|
||||
ए फ़्लाइंग जट्ट (revision 4966138)
|
||||
पश्चिम कालिमंतान (revision 5414352)
|
||||
ओटावा (revision 5422563)
|
||||
कलिंजर (revision 5671122)
|
||||
हैदराबाद (revision 5683737)
|
||||
कालिंजर दुर्ग (revision 5671122)
|
||||
कानपुर (revision 5626845)
|
||||
अभिनेत्री (revision 5628024)
|
||||
उर्दू विकिपीडिया (revision 5234893)
|
||||
एयर इंडिया एक्सप्रेस उड़ान ८१२ (revision 4821073)
|
||||
रोड आइलैण्ड (revision 3532775)
|
||||
उत्तराखंड (revision 5714067)
|
||||
२००३ (revision 5163976)
|
||||
कॅरीबियाई सागर (revision 4826292)
|
||||
कॉलोराडो (revision 4534558)
|
||||
८ सितम्बर (revision 5633733)
|
||||
शक्ति कपूर (revision 5473336)
|
||||
बास्क विकिपीडिया (revision 5092568)
|
||||
हैतियाई क्रियोल विकिपीडिया (revision 5161239)
|
||||
मकर संक्रांति (revision 5481015)
|
||||
गंगा नदी (revision 5703207)
|
||||
उत्तर-पश्चिम (revision 5592378)
|
||||
एबीसीडी 2 (revision 5216546)
|
||||
फेसबुक (revision 5691471)
|
||||
अंग्रेज़ी भाषा (revision 5681103)
|
||||
बत्ती गुल मीटर चालू (revision 5567871)
|
||||
नॉर्थ कैरोलीना (revision 3530544)
|
||||
अलास्का (revision 5450294)
|
||||
सिद्धान्त कपूर (revision 5385513)
|
||||
फ़्लोरिडा (revision 5441902)
|
||||
इमरान ख़ान (अभिनेता) (revision 5614601)
|
||||
गोलमेज सम्मेलन (भारत) (revision 5673517)
|
||||
फेरीहेगी हवाई अड्डा (revision 2482999)
|
||||
सिंधु-गंगा-ब्रह्मपुत्र का मैदान (revision 4962707)
|
||||
उत्तरी अमेरिका (revision 5472773)
|
||||
सोमवती अमावस्या (revision 5593417)
|
||||
आंग्ल-मैसूर युद्ध (revision 4018741)
|
||||
दोआब (revision 5684817)
|
||||
१७ जनवरी (revision 4809344)
|
||||
सकल घरेलू उत्पाद के अनुसार देशों की सूची (पीपीपी) (revision 5560304)
|
||||
जर्मन विकिपीडिया (revision 4757076)
|
||||
भारत में कम्पनी शासन (revision 5506915)
|
||||
वर्ग संघर्ष (revision 5676082)
|
||||
चार्ल्स तृतीय (revision 5672055)
|
||||
मुत्तुलक्ष्मी रेड्डी (revision 5628933)
|
||||
उत्तर प्रदेश (revision 5711856)
|
||||
शिंजो आबे की हत्या (revision 5705526)
|
||||
स्ट्रीट डांसर (revision 5111042)
|
||||
जनसंख्या के अनुसार देशों की सूची (revision 5713053)
|
||||
हलवारा एयर फ़ोर्स स्टेशन (revision 4951493)
|
||||
अहमदाबाद (revision 5711144)
|
||||
सरस्वती नदी (revision 5688450)
|
||||
मीका सिंह (revision 5594882)
|
||||
मंगलवार व्रत कथा (revision 5686278)
|
||||
सेबुआनो विकिपीडिया (revision 4611299)
|
||||
ख़िलजी वंश (revision 5682608)
|
||||
नेब्रास्का (revision 4584163)
|
||||
कंगना रनौत (revision 5680317)
|
||||
स्कॉटलैंड (revision 5556244)
|
||||
केन्सास (revision 4826538)
|
||||
टोंस नदी (revision 5610286)
|
||||
गाम्बिया (revision 5105416)
|
||||
कज़ाख़ विकिपीडिया (revision 4518946)
|
||||
१९२६ (revision 4538959)
|
||||
यतीन्द्र मोहन सेनगुप्त (revision 5284794)
|
||||
शुंग राजवंश (revision 5713142)
|
||||
कोटोका हवाई अड्डा (revision 4725941)
|
||||
पापुआ न्यू गिनी (revision 5599432)
|
||||
प्रिन्स एड्वर्ड आइलैण्ड (revision 4773952)
|
||||
होमी वाडिया (revision 5199599)
|
||||
स्पेनी भाषा (revision 5599387)
|
||||
क्रोएशियाई विकिपीडिया (revision 5075650)
|
||||
ग्रह (revision 5688287)
|
||||
भारतीय रुपया (revision 5668341)
|
||||
१८९१ (revision 4554755)
|
||||
सन्त पियर और मिकलान (revision 4544887)
|
||||
उत्तराखण्ड (revision 5714067)
|
||||
सीसा (revision 5605646)
|
||||
झारखण्ड (revision 5706129)
|
||||
फ़िल्मफ़ेयर महिला प्रथम अभिनय पुरस्कार (revision 5381963)
|
||||
हरिद्वार जिला (revision 5597326)
|
||||
शाहिद कपूर (revision 5598756)
|
||||
उर्दू (revision 5674668)
|
||||
४ मार्च (revision 5486757)
|
||||
जनसंख्या (revision 5669984)
|
||||
रमा एकादशी (revision 5358633)
|
||||
सरोजिनी नायडू (revision 4940524)
|
||||
दक्षिणी केरोलाइना (revision 4962382)
|
||||
शनिवार व्रत कथा (revision 5143692)
|
||||
डीडी सह्याद्री (revision 5593159)
|
||||
कुर्नूल जिला (revision 5115081)
|
||||
अलीगढ़ जिला (revision 5703336)
|
||||
७ जनवरी (revision 5381367)
|
||||
इंग्लैण्ड (revision 5685229)
|
||||
विकिडाटा (revision 5414193)
|
||||
New York City (revision 5698022)
|
||||
पापुआ (प्रांत) (revision 5295334)
|
||||
म्यान्मार (revision 5715413)
|
||||
कॉलेज (revision 5657658)
|
||||
चीनी विकिपीडिया (revision 4755371)
|
||||
|
||||
== End of Parsed pages ==
|
||||
|
||||
- Wikipedia parsing ended at: 2022-12-14 20:23:15.737540
|
||||
|
||||
83 characters appeared 832927 times.
|
||||
|
||||
Most Frequent characters:
|
||||
[ 0] Char ा: 8.632209065140161 %
|
||||
[ 1] Char र: 6.632393955292601 %
|
||||
[ 2] Char क: 6.426133382637373 %
|
||||
[ 3] Char ्: 6.190098291927144 %
|
||||
[ 4] Char े: 5.538180416771218 %
|
||||
[ 5] Char ि: 4.2936535854882845 %
|
||||
[ 6] Char स: 4.009715137100851 %
|
||||
[ 7] Char न: 3.86132278098801 %
|
||||
[ 8] Char ं: 3.6268484513048564 %
|
||||
[ 9] Char त: 3.5399260679507325 %
|
||||
[10] Char ी: 3.4780959195703827 %
|
||||
[11] Char म: 3.2890037182130007 %
|
||||
[12] Char ह: 2.9841750837708467 %
|
||||
[13] Char य: 2.831460620198409 %
|
||||
[14] Char ल: 2.57129376283876 %
|
||||
[15] Char प: 2.498538287268872 %
|
||||
[16] Char व: 2.2905968950460243 %
|
||||
[17] Char ो: 2.1803831548262935 %
|
||||
[18] Char द: 1.9969337048744966 %
|
||||
[19] Char ज: 1.7171973054061163 %
|
||||
[20] Char ग: 1.5075750936156471 %
|
||||
[21] Char ै: 1.3684272451247228 %
|
||||
[22] Char ब: 1.325926521772016 %
|
||||
[23] Char ु: 1.2974726476630005 %
|
||||
[24] Char श: 1.0392267269520619 %
|
||||
[25] Char ।: 1.0391066684115176 %
|
||||
[26] Char अ: 0.894676244136641 %
|
||||
[27] Char ट: 0.8455723010539938 %
|
||||
[28] Char भ: 0.8257626418641729 %
|
||||
[29] Char थ: 0.8191594221342327 %
|
||||
[30] Char ड: 0.7635723178621896 %
|
||||
[31] Char ू: 0.7341579754288191 %
|
||||
[32] Char च: 0.6675254854266941 %
|
||||
[33] Char ए: 0.6664449585617948 %
|
||||
[34] Char ष: 0.6245445279118098 %
|
||||
[35] Char ध: 0.62070265461439 %
|
||||
[36] Char उ: 0.5550306329366199 %
|
||||
[37] Char इ: 0.5436250715849048 %
|
||||
[38] Char ख: 0.5100086802324814 %
|
||||
[39] Char ण: 0.5062868654756059 %
|
||||
[40] Char औ: 0.4841960940154419 %
|
||||
[41] Char आ: 0.4615050298525561 %
|
||||
[42] Char ़: 0.4564625711496926 %
|
||||
[43] Char फ: 0.337004323308045 %
|
||||
[44] Char ई: 0.3067495710908639 %
|
||||
[45] Char ँ: 0.22174812438545033 %
|
||||
[46] Char ौ: 0.16676131281612913 %
|
||||
[47] Char ृ: 0.16676131281612913 %
|
||||
[48] Char ॉ: 0.15523569292386968 %
|
||||
[49] Char छ: 0.14719177070739695 %
|
||||
[50] Char ०: 0.13110392627445142 %
|
||||
[51] Char घ: 0.13074375065281832 %
|
||||
[52] Char ओ: 0.12065883324709127 %
|
||||
[53] Char १: 0.11321520373334037 %
|
||||
[54] Char ठ: 0.10865297919265433 %
|
||||
[55] Char ढ: 0.08608197357031289 %
|
||||
[56] Char २: 0.08320056859724802 %
|
||||
[57] Char ५: 0.04994435286645769 %
|
||||
[58] Char ऑ: 0.048863826001558364 %
|
||||
[59] Char ९: 0.04838359183938088 %
|
||||
[60] Char ऊ: 0.046942889352848446 %
|
||||
[61] Char झ: 0.045021952704138536 %
|
||||
[62] Char ६: 0.04166031356889619 %
|
||||
[63] Char ८: 0.04069984524454124 %
|
||||
[64] Char ७: 0.03721814756875452 %
|
||||
[65] Char ऐ: 0.03721814756875452 %
|
||||
[66] Char ३: 0.03685797194712142 %
|
||||
[67] Char ॰: 0.03325621573079033 %
|
||||
[68] Char ४: 0.032535864487524116 %
|
||||
[69] Char ञ: 0.028934108271193033 %
|
||||
[70] Char ः: 0.028453874109015558 %
|
||||
[71] Char ऋ: 0.012125912594981313 %
|
||||
[72] Char ऍ: 0.003961931837964191 %
|
||||
[73] Char ॅ: 0.0024011708108873887 %
|
||||
[74] Char ङ: 0.0022811122703430193 %
|
||||
[75] Char ऽ: 0.0007203512432662167 %
|
||||
[76] Char ॥: 0.0006002927027218472 %
|
||||
[77] Char ऎ: 0.00048023416217747773 %
|
||||
[78] Char ॆ: 0.00036017562163310834 %
|
||||
[79] Char ळ: 0.00036017562163310834 %
|
||||
[80] Char ॄ: 0.00024011708108873886 %
|
||||
[81] Char ॠ: 0.00012005854054436943 %
|
||||
[82] Char ऱ: 0.00012005854054436943 %
|
||||
|
||||
The first 83 characters have an accumulated ratio of 0.9999999999999998.
|
||||
The first 7 characters have an accumulated ratio of 0.4172238383435763.
|
||||
All characters whose order is over 41 have an accumulated ratio of 0.033113346067542536.
|
||||
|
||||
2239 sequences found.
|
||||
|
||||
First 1379 (typical positive ratio): 0.9950003274894658
|
||||
Next 426 (1805-1379): 0.004003168850449601
|
||||
Rest: 0.0009965036600846355
|
||||
|
||||
- Processing end: 2022-12-14 20:23:16.458133
|
||||
238
script/BuildLangModelLogs/LangHungarianModel.log
Normal file
238
script/BuildLangModelLogs/LangHungarianModel.log
Normal file
@ -0,0 +1,238 @@
|
||||
= Logs of language model for Hungarian (hu) =
|
||||
|
||||
- Generated by BuildLangModel.py
|
||||
- Started: 2022-12-15 00:13:09.328100
|
||||
- Maximum depth: 4
|
||||
- Max number of pages: 200
|
||||
|
||||
== Parsed pages ==
|
||||
|
||||
Kezdőlap (revision 21016160)
|
||||
Harrison Schmitt (revision 25091016)
|
||||
Hermann Nuber (revision 25593806)
|
||||
Űrhajó (revision 25525898)
|
||||
2003 (revision 25433016)
|
||||
World Chess Hall of Fame (revision 25593350)
|
||||
Théophile Thoré-Bürger (revision 24284108)
|
||||
Kazahsztán (revision 25514356)
|
||||
Holland aranykor (revision 25587241)
|
||||
1897 (revision 25545732)
|
||||
Űrszonda (revision 25472795)
|
||||
Sándor Júlia (színművész) (revision 25600510)
|
||||
Uwe Seeler (revision 25522695)
|
||||
Republikánus Párt (Amerikai Egyesült Államok) (revision 25560224)
|
||||
Nemzetközi Virtuális Katalógustár (revision 23032870)
|
||||
Gerrit Dou (revision 22591451)
|
||||
1842 (revision 22164991)
|
||||
Iszlám Konferencia Szervezete (revision 25583883)
|
||||
Groovehouse (revision 25232046)
|
||||
1840-es évek (revision 14636883)
|
||||
Mars Odyssey (revision 25521678)
|
||||
Október 15. (revision 25582677)
|
||||
Szeptember 16. (revision 25282517)
|
||||
Tata (revision 25553218)
|
||||
Apollo–14 (revision 25179663)
|
||||
Nemzetközi Szabványos Névazonosító (revision 25150715)
|
||||
Új-Mexikó (revision 25499464)
|
||||
Harry Nelson Pillsbury (revision 25593354)
|
||||
Kocsák Tibor (revision 24620283)
|
||||
Erich Juskowiak (revision 24796419)
|
||||
1984 (revision 25589154)
|
||||
Led Zeppelin (revision 25472470)
|
||||
Günter Sawitzki (revision 23354864)
|
||||
David Scott (revision 25436255)
|
||||
Európai Unió (revision 25577990)
|
||||
Bakonyi Károly (író) (revision 22870612)
|
||||
Wehrmacht (revision 23964625)
|
||||
Automated Transfer Vehicle (revision 23558751)
|
||||
Ultimate Toni Braxton (revision 22861977)
|
||||
Zerkovitz Béla (revision 25089941)
|
||||
Sojourner (revision 25292375)
|
||||
Huygens űrszonda (revision 25292343)
|
||||
Missouri (állam) (revision 25499958)
|
||||
John Young (űrhajós) (revision 23542603)
|
||||
CST–100 Starliner (revision 24958423)
|
||||
Kisváros (televíziós sorozat) (revision 25463748)
|
||||
Unique (revision 25488615)
|
||||
1866 (revision 25577779)
|
||||
Prohászka Lajos (revision 25119494)
|
||||
December 13. (revision 25596291)
|
||||
Montana (állam) (revision 25485620)
|
||||
Paul Delaroche (revision 25105526)
|
||||
SpaceShipOne (revision 24595922)
|
||||
Eric Knight (revision 25211765)
|
||||
Space Shuttle (revision 25373392)
|
||||
December 14. (revision 25589024)
|
||||
Öszkemen (revision 23574721)
|
||||
Molnár Ferenc (író) (revision 25562912)
|
||||
H–II Transfer Vehicle (revision 22613162)
|
||||
Beagle 2 (revision 22854624)
|
||||
Naprendszer-kutatás (revision 25540138)
|
||||
Heinz Kwiatkowski (revision 16523678)
|
||||
Herbert Erhardt (revision 21170335)
|
||||
Norvégia (revision 25417273)
|
||||
Április 30. (revision 25477768)
|
||||
Deep Space–2 (revision 24769213)
|
||||
Gyémántcsiszolás (revision 23253547)
|
||||
Nyikolaj Vasziljevics Gogol (revision 25438128)
|
||||
Országgyűlési Könyvtár (Japán) (revision 22240125)
|
||||
Gustave Courbet (revision 25056702)
|
||||
São Tomé (revision 18440573)
|
||||
Francia Nemzeti Könyvtár (revision 25053605)
|
||||
1940-es évek (revision 25468852)
|
||||
1954 (revision 25472761)
|
||||
Új-Amszterdam (revision 25553458)
|
||||
Helmut Rahn (revision 25568751)
|
||||
Németország (revision 25584000)
|
||||
Szeptember (revision 25345876)
|
||||
Arizona (revision 25141534)
|
||||
Je Huj-mej (revision 24141634)
|
||||
1933-as sakkolimpia (revision 19052157)
|
||||
Octaviano A. Larrazolo (revision 25595801)
|
||||
Kongresszusi Könyvtár (revision 23872247)
|
||||
Carel Fabritius (revision 24719147)
|
||||
1750 (revision 21462369)
|
||||
Óceánia (revision 24068092)
|
||||
Curaçao (revision 25405882)
|
||||
Monica Arnold (revision 25485521)
|
||||
Wilhelm Steinitz (revision 25050687)
|
||||
Romhányi József (költő) (revision 23920334)
|
||||
James Irwin (revision 22686505)
|
||||
Űreszköz (revision 23734283)
|
||||
Joanne Kathleen Rowling (revision 25434419)
|
||||
Georges Feydeau (revision 25584375)
|
||||
1980 (revision 25477733)
|
||||
Frízföld (revision 24959492)
|
||||
Német labdarúgó-válogatott (revision 25559026)
|
||||
Skylab-program (revision 25502299)
|
||||
Simon de Vlieger (revision 19720677)
|
||||
Vegyifegyver-tilalmi Szervezet (revision 21172811)
|
||||
Szigeti József (színművész) (revision 24996854)
|
||||
Voszhod (űrhajó) (revision 25558337)
|
||||
Kínai Köztársaság (revision 25017788)
|
||||
Howard Staunton (revision 25481088)
|
||||
Április 5. (revision 25177790)
|
||||
Holdra szállások emberrel (revision 24674534)
|
||||
1970 (revision 25393037)
|
||||
Kopt naptár (revision 25412286)
|
||||
1869 (revision 25571248)
|
||||
Eugène Delacroix (revision 24418419)
|
||||
Number Ones (revision 25419833)
|
||||
Frans Hals (revision 24664499)
|
||||
Párizs (revision 25523874)
|
||||
19. század (revision 25584168)
|
||||
TKSZ (revision 24419778)
|
||||
Joachim József (képzőművész) (revision 25379047)
|
||||
Fritz Walter (labdarúgó, 1920) (revision 24942897)
|
||||
SpaceShipTwo (revision 24783264)
|
||||
Abraham van Dijck (revision 24554516)
|
||||
Kornis Mihály (író) (revision 25567608)
|
||||
Tiencsou (űrhajó) (revision 24392971)
|
||||
Alcazar (revision 25135995)
|
||||
Alekszandr Alekszandrovics Aljechin (revision 25489078)
|
||||
Galileo (revision 25442481)
|
||||
Jet Propulsion Laboratory (revision 23470554)
|
||||
Sepp Herberger (revision 24594437)
|
||||
Október 25. (revision 25499623)
|
||||
1976-os sakkolimpia (revision 18534577)
|
||||
Március 17. (revision 25358861)
|
||||
Phoenix űrszonda (revision 25521690)
|
||||
1930-as évek (revision 25468850)
|
||||
Orbiter (revision 21595964)
|
||||
Origo.hu (revision 25454114)
|
||||
Február 27. (revision 25466236)
|
||||
Besançon (revision 24962532)
|
||||
Por (revision 25545714)
|
||||
Június 17. (revision 25587143)
|
||||
Magyarország (revision 25601041)
|
||||
Dennis Chavez (revision 25595172)
|
||||
Vígszínház (revision 25525908)
|
||||
Michigan (revision 25491956)
|
||||
ISSN (revision 25077488)
|
||||
1958-as labdarúgó-világbajnokság (revision 24895049)
|
||||
Hordozórakéta (revision 24212311)
|
||||
Comore-szigetek (revision 25521695)
|
||||
Hans Cieslarczyk (revision 22742075)
|
||||
Vírusháború (revision 21067008)
|
||||
Szökőnap (revision 25517785)
|
||||
Enterprise űrrepülőgép (revision 25431966)
|
||||
Louvre (revision 25010548)
|
||||
Kozmikus por (revision 24318589)
|
||||
Iványi Ödön (revision 23112343)
|
||||
Johannes Fabritius (revision 21073945)
|
||||
Union List of Artist Names (revision 22813546)
|
||||
Karel van Mander (revision 24479929)
|
||||
Amszterdam (revision 25510137)
|
||||
Albuquerque (revision 25449314)
|
||||
Űrrepülés (revision 25477124)
|
||||
Heinz Wewers (revision 20183927)
|
||||
Budapest (revision 25558411)
|
||||
Horst Eckel (revision 24441000)
|
||||
1944 (revision 25556832)
|
||||
Neil Armstrong (revision 25293956)
|
||||
Füst Milán (revision 25575179)
|
||||
Guglielmo Marconi (revision 25188427)
|
||||
Integrált katalógustár (revision 22941517)
|
||||
Jogi személy (revision 25094378)
|
||||
Jean Auguste Dominique Ingres (revision 24884797)
|
||||
Texas (revision 25207338)
|
||||
1975 (revision 25540476)
|
||||
MSV Duisburg (revision 24591734)
|
||||
Saint-Maurice (Val-de-Marne) (revision 22828244)
|
||||
Második világháború (revision 25573355)
|
||||
Bilicsi Tivadar (revision 25556818)
|
||||
1930 (revision 25265852)
|
||||
A legnagyobb könyvtárak listája (revision 24664862)
|
||||
Eötvös Károly (revision 25572259)
|
||||
|
||||
== End of Parsed pages ==
|
||||
|
||||
- Wikipedia parsing ended at: 2022-12-15 00:16:34.281907
|
||||
|
||||
60 characters appeared 1757438 times.
|
||||
|
||||
Most Frequent characters:
|
||||
[ 0] Char e: 9.670497622106726 %
|
||||
[ 1] Char a: 9.174719108156305 %
|
||||
[ 2] Char t: 7.571533106715571 %
|
||||
[ 3] Char s: 6.333025688530691 %
|
||||
[ 4] Char l: 6.128125145808842 %
|
||||
[ 5] Char n: 5.484062595664826 %
|
||||
[ 6] Char r: 4.979919633011236 %
|
||||
[ 7] Char k: 4.555779492647821 %
|
||||
[ 8] Char i: 4.5097465742745975 %
|
||||
[ 9] Char o: 4.199067051014032 %
|
||||
[10] Char z: 4.084923621772147 %
|
||||
[11] Char á: 3.5802685500142823 %
|
||||
[12] Char é: 3.1796285274359604 %
|
||||
[13] Char g: 3.1547627853727986 %
|
||||
[14] Char m: 3.0938218019639954 %
|
||||
[15] Char b: 2.252711048697024 %
|
||||
[16] Char d: 2.0156614344289814 %
|
||||
[17] Char v: 1.937365642486392 %
|
||||
[18] Char y: 1.9164260702226765 %
|
||||
[19] Char h: 1.412169305545914 %
|
||||
[20] Char p: 1.3879294746101996 %
|
||||
[21] Char u: 1.3544147787859373 %
|
||||
[22] Char j: 1.2084067830557892 %
|
||||
[23] Char ö: 1.0528394173791622 %
|
||||
[24] Char ó: 1.0058960828205603 %
|
||||
[25] Char f: 0.9902483046343598 %
|
||||
[26] Char c: 0.9528074390106507 %
|
||||
[27] Char ő: 0.8960202294476391 %
|
||||
[28] Char í: 0.597233017608587 %
|
||||
[29] Char ü: 0.5129057184378624 %
|
||||
[30] Char ú: 0.310508820225806 %
|
||||
[31] Char ű: 0.2789856598070601 %
|
||||
|
||||
The first 32 characters have an accumulated ratio of 0.9978241053169444.
|
||||
The first 6 characters have an accumulated ratio of 0.4436196326698296.
|
||||
All characters whose order is over 25 have an accumulated ratio of 0.03548460884537605.
|
||||
|
||||
1221 sequences found.
|
||||
|
||||
First 713 (typical positive ratio): 0.9950428169725475
|
||||
Next 143 (856-713): 0.003957871411948477
|
||||
Rest: 0.0009993116155040394
|
||||
|
||||
- Processing end: 2022-12-15 00:16:34.398122
|
||||
222
script/BuildLangModelLogs/LangIrishModel.log
Normal file
222
script/BuildLangModelLogs/LangIrishModel.log
Normal file
@ -0,0 +1,222 @@
|
||||
= Logs of language model for Irish (ga) =
|
||||
|
||||
- Generated by BuildLangModel.py
|
||||
- Started: 2022-12-14 23:56:43.375514
|
||||
- Maximum depth: 4
|
||||
- Max number of pages: 200
|
||||
|
||||
== Parsed pages ==
|
||||
|
||||
Tracy Caldwell Dyson (revision 972597)
|
||||
Ceimiceoir (revision 1069325)
|
||||
14 Lúnasa (revision 1096367)
|
||||
NASA (revision 1105064)
|
||||
1969 (revision 1120841)
|
||||
Fisiceoir (revision 1070391)
|
||||
Rúisis (revision 1106700)
|
||||
Stáit Aontaithe Mheiriceá (revision 1117044)
|
||||
Ceimic (revision 1118628)
|
||||
Tointeálaí spáis (revision 1049998)
|
||||
Comhdhúil (revision 1026330)
|
||||
An Sciath (revision 1107011)
|
||||
An Conradh Versailles (revision 1085221)
|
||||
James Watt (revision 1118273)
|
||||
An tAontas Sóivéadach (revision 1012309)
|
||||
17 Meán Fómhair (revision 1120902)
|
||||
An tSanscrait (revision 1060099)
|
||||
Paul Rudd (revision 1069358)
|
||||
Hidrigin (revision 1047874)
|
||||
Torstein Slungård (revision 1071382)
|
||||
Fyodor Dostoyevsky (revision 1118623)
|
||||
Náisiúin Aontaithe (revision 1059374)
|
||||
Inneall dócháin inmheánaigh (revision 894804)
|
||||
Slavaigh (revision 1113298)
|
||||
Radaighníomhaíocht (revision 1119072)
|
||||
2003 (revision 1120836)
|
||||
1 Eanáir (revision 1105589)
|
||||
Teileascóp Spáis Hubble (revision 1110496)
|
||||
Washington (revision 1105710)
|
||||
Michael Loewe (revision 1084656)
|
||||
1986 (revision 1116382)
|
||||
Gníomhaireacht Spáis na hEorpa (revision 1118858)
|
||||
Fisic chandamach (revision 1045815)
|
||||
26 Márta (revision 1121038)
|
||||
1980í (revision 740211)
|
||||
Henri Becquerel (revision 1056324)
|
||||
Coiscéim (revision 1106395)
|
||||
2005 (revision 1095195)
|
||||
Guglielmo Marconi (revision 1063391)
|
||||
Pápa Urban VII (revision 1073248)
|
||||
Aoine (revision 1051861)
|
||||
Nathaniel Hawthorne (revision 1038559)
|
||||
Steve Martin (revision 1057250)
|
||||
Comhdháil na Stát Aontaithe (revision 1093715)
|
||||
15 Samhain (revision 1121034)
|
||||
Benjamin Franklin (revision 998375)
|
||||
1802 (revision 1120813)
|
||||
Scríbhneoir (revision 1058515)
|
||||
Georg Ohm (revision 1031426)
|
||||
Mississippi (revision 989867)
|
||||
Géarchéim airgeadais 2007-2008 (revision 1107877)
|
||||
1988 (revision 1120757)
|
||||
An Bhealarúisis (revision 1106925)
|
||||
Anraí IV, Impire Naofa Rómhánach (revision 981853)
|
||||
Inneall (revision 656989)
|
||||
Pedro Castillo (revision 1105978)
|
||||
5 Lúnasa (revision 1096166)
|
||||
Patrick Hogan (Ceann Comhairle) (revision 1000969)
|
||||
Gale Sondergaard (revision 1082164)
|
||||
Nasc hidrigine (revision 1037892)
|
||||
12 Lúnasa (revision 1096374)
|
||||
Jack Black (revision 1057573)
|
||||
Réabhlóid Thionsclaíoch (revision 855713)
|
||||
Lúnasa (revision 970011)
|
||||
Spacelab (revision 691109)
|
||||
An tSínis (revision 1059378)
|
||||
Galileo Galilei (revision 1115720)
|
||||
André-Marie Ampère (revision 1107040)
|
||||
Robert Millikan (revision 995498)
|
||||
George W. Bush (revision 981498)
|
||||
Seanad Stáit Aontaithe Mheiriceá (revision 1093725)
|
||||
Stáisiún Spáis Idirnáisiúnta (revision 1070335)
|
||||
An Domhan (revision 1070377)
|
||||
Ed Miliband (revision 1103646)
|
||||
Roicéad (revision 878524)
|
||||
16 Lúnasa (revision 1096217)
|
||||
An Cheimic Orgánach (revision 1108626)
|
||||
Daniel Gabriel Fahrenheit (revision 992356)
|
||||
Missouri (revision 1109999)
|
||||
13 Deireadh Fómhair (revision 1120931)
|
||||
1956 (revision 1120816)
|
||||
An Béarla (revision 1114272)
|
||||
Max Planck (revision 1109772)
|
||||
Stephen Hawking (revision 1018555)
|
||||
1890 (revision 1107161)
|
||||
Clár Apollo (revision 1023565)
|
||||
10ú haois (revision 739954)
|
||||
Matamaiticeoir (revision 664190)
|
||||
Lá (revision 1094718)
|
||||
Liosta curaidh dornálaíochta marcmheáchan (revision 1061059)
|
||||
Scannán (revision 1053664)
|
||||
20 Márta (revision 1121020)
|
||||
José Nápoles (revision 1058234)
|
||||
Port Láirge (revision 1109632)
|
||||
Kevin Durant (revision 1070912)
|
||||
Dornálaí (revision 1113619)
|
||||
An Ghealach (revision 1094227)
|
||||
Uttar Pradesh (revision 1089347)
|
||||
Cumann Ríoga (revision 1033135)
|
||||
1938 (revision 1120849)
|
||||
13 Feabhra (revision 1096343)
|
||||
Páirtí Daonlathach (Stáit Aontaithe) (revision 1057384)
|
||||
Bialann (revision 865145)
|
||||
An Iaráic (revision 1069477)
|
||||
Sruth leictreach (revision 1095161)
|
||||
An tSualainn (revision 1119697)
|
||||
Discogs (revision 1082211)
|
||||
Peiriú (revision 983642)
|
||||
Nasc comhfhiúsach (revision 1006964)
|
||||
Núicléas adamhach (revision 1001970)
|
||||
Litríocht (revision 858847)
|
||||
Joseph McLoughlin (revision 999649)
|
||||
Geansaí (oileán) (revision 977009)
|
||||
Nasc dúbailte (revision 1021914)
|
||||
7 Lúnasa (revision 1096108)
|
||||
Fuinneamh (revision 889979)
|
||||
Éamann Iognáid Rís (revision 1111361)
|
||||
1770í (revision 1047266)
|
||||
Shane Duffy (revision 1068009)
|
||||
19ú haois (revision 1083522)
|
||||
Cory Booker (revision 1104516)
|
||||
Austin, Texas (revision 1107884)
|
||||
An Eala (revision 1103781)
|
||||
Esteban Ocon (revision 1008066)
|
||||
An spás amuigh (revision 1067792)
|
||||
17 Márta (revision 1096197)
|
||||
Gdańsk (revision 1120368)
|
||||
An Rómáinis (revision 1059147)
|
||||
An Rómáin (revision 1098710)
|
||||
Buddy Hackett (revision 1072216)
|
||||
Cruthaitheoir YouTube (revision 1104902)
|
||||
Innealtóir (revision 1058516)
|
||||
13 Samhain (revision 1100784)
|
||||
Lá Idirnáisiúnta ár Máthair-Chruinne (revision 1072131)
|
||||
IMDb (revision 1120126)
|
||||
James Madison (revision 1043959)
|
||||
Leabhar (revision 1119032)
|
||||
Glaschú (revision 1049306)
|
||||
Líne ghinealaigh Ind-Eorpaise (revision 1063677)
|
||||
Liosta Institiúidí Pleanála Teanga ar fud an Domhain (revision 652223)
|
||||
An Fhrainc (revision 1116453)
|
||||
1917 (revision 1095292)
|
||||
1993 (revision 1120754)
|
||||
Sciath (revision 996696)
|
||||
17 Eanáir (revision 1116335)
|
||||
Victor Hugo (revision 1025016)
|
||||
Stróc (revision 1064531)
|
||||
An Ghearmáin (revision 1100272)
|
||||
18 Lúnasa (revision 1096169)
|
||||
Leabharlann (revision 784239)
|
||||
1979 (revision 1119017)
|
||||
1923 (revision 1111031)
|
||||
Mais adamhach (revision 1109903)
|
||||
Seanfhocal (revision 944374)
|
||||
An Spáinnis (revision 1117234)
|
||||
1882 (revision 1120877)
|
||||
Minicíocht (revision 1095200)
|
||||
Aibhléis (revision 1059436)
|
||||
19 Bealtaine (revision 1120900)
|
||||
1 Deireadh Fómhair (revision 1120890)
|
||||
10 Meitheamh (revision 1100872)
|
||||
An Chóiré Theas (revision 976979)
|
||||
|
||||
== End of Parsed pages ==
|
||||
|
||||
- Wikipedia parsing ended at: 2022-12-14 23:59:16.710065
|
||||
|
||||
55 characters appeared 447829 times.
|
||||
|
||||
Most Frequent characters:
|
||||
[ 0] Char a: 15.43379280930891 %
|
||||
[ 1] Char i: 10.336088105057957 %
|
||||
[ 2] Char n: 8.36011066724129 %
|
||||
[ 3] Char h: 7.38295197497259 %
|
||||
[ 4] Char r: 6.3988709976352585 %
|
||||
[ 5] Char e: 6.346842209861353 %
|
||||
[ 6] Char s: 5.180325526037841 %
|
||||
[ 7] Char t: 4.859667417697381 %
|
||||
[ 8] Char c: 4.694649073641948 %
|
||||
[ 9] Char o: 4.064274533359832 %
|
||||
[10] Char l: 4.049536765149197 %
|
||||
[11] Char d: 3.074611068063926 %
|
||||
[12] Char g: 2.827418501258293 %
|
||||
[13] Char m: 2.7986128633920537 %
|
||||
[14] Char u: 2.072666129259159 %
|
||||
[15] Char b: 1.99674429302256 %
|
||||
[16] Char á: 1.979103631073468 %
|
||||
[17] Char í: 1.7057850206217104 %
|
||||
[18] Char é: 1.3335447235440314 %
|
||||
[19] Char f: 1.1861670414376917 %
|
||||
[20] Char ó: 0.9369647789669718 %
|
||||
[21] Char ú: 0.9101688367658191 %
|
||||
[22] Char p: 0.8715380200924907 %
|
||||
[23] Char y: 0.2809107940754172 %
|
||||
[24] Char k: 0.2456294701772328 %
|
||||
[25] Char v: 0.24183337836540286 %
|
||||
[26] Char w: 0.18221240696783816 %
|
||||
[27] Char j: 0.11298955628152711 %
|
||||
[28] Char z: 0.05359188440230534 %
|
||||
[29] Char x: 0.033048328714754965 %
|
||||
[30] Char q: 0.014961067728976908 %
|
||||
|
||||
The first 31 characters have an accumulated ratio of 0.999656118741752.
|
||||
The first 4 characters have an accumulated ratio of 0.41512943556580745.
|
||||
All characters whose order is over 19 have an accumulated ratio of 0.038838485225387374.
|
||||
|
||||
866 sequences found.
|
||||
|
||||
First 465 (typical positive ratio): 0.9950412672131922
|
||||
Next 161 (626-465): 0.003964208227885013
|
||||
Rest: 0.0009945245589227936
|
||||
|
||||
- Processing end: 2022-12-14 23:59:16.758047
|
||||
257
script/BuildLangModelLogs/LangItalianModel.log
Normal file
257
script/BuildLangModelLogs/LangItalianModel.log
Normal file
@ -0,0 +1,257 @@
|
||||
= Logs of language model for Italian (it) =
|
||||
|
||||
- Generated by BuildLangModel.py
|
||||
- Started: 2022-12-15 00:00:41.412936
|
||||
- Maximum depth: 4
|
||||
- Max number of pages: 200
|
||||
|
||||
== Parsed pages ==
|
||||
|
||||
Pieve Ligure (revision 130491730)
|
||||
Sant'Olcese (revision 130492210)
|
||||
Hippocampus hippocampus (revision 128855725)
|
||||
Camogli (revision 129723117)
|
||||
Saraceni (revision 130967096)
|
||||
Santo Stefano d'Aveto (revision 130492292)
|
||||
Rari Nantes Camogli (revision 130382864)
|
||||
Cogorno (revision 130488467)
|
||||
Rondanina (revision 130492038)
|
||||
Monte Fasce (revision 130194036)
|
||||
Papa Clemente XIII (revision 129913193)
|
||||
Fascia (Italia) (revision 130488511)
|
||||
Unione dei comuni della Valle del Tempo (revision 129948272)
|
||||
Neirone (revision 130524386)
|
||||
Targa d'immatricolazione (revision 128141509)
|
||||
Ostrya carpinifolia (revision 126337351)
|
||||
Grado giorno (revision 126566542)
|
||||
Lumarzo (revision 130491475)
|
||||
Mar Ligure (revision 130107542)
|
||||
Paladino (revision 129882952)
|
||||
Bologna (revision 130936185)
|
||||
Val Graveglia (revision 130327168)
|
||||
Cesena Football Club (revision 130936806)
|
||||
Castagno (revision 130959551)
|
||||
Sindaco (Italia) (revision 130620519)
|
||||
Serie A1 1992-1993 (pallanuoto maschile) (revision 123100560)
|
||||
Papa Benedetto VII (revision 127551579)
|
||||
Museo di Stato di San Marino (revision 129598251)
|
||||
Turismo (revision 130720212)
|
||||
Porifera (revision 130525526)
|
||||
Orcynopsis unicolor (revision 109105696)
|
||||
Altitudine (revision 129167670)
|
||||
Regno Unito (revision 130948019)
|
||||
Serie A 1977 (pallanuoto maschile) (revision 123084389)
|
||||
Santa Margherita Ligure (revision 130552435)
|
||||
Isole Canarie (revision 130471944)
|
||||
Fauna (revision 130513531)
|
||||
UTC+1 (revision 130216176)
|
||||
Repubblica Ligure (revision 130981553)
|
||||
Moneta (revision 129909120)
|
||||
Dipartimento di Genova (revision 123875710)
|
||||
Centro-destra (revision 129516922)
|
||||
Campomorone (revision 130488271)
|
||||
Pro Recco Waterpolo 1913 (revision 130855089)
|
||||
VIII secolo (revision 123734209)
|
||||
Codice postale (revision 129886472)
|
||||
Lorsica (revision 130491462)
|
||||
Oratorio di San Martino (Cogorno) (revision 121530764)
|
||||
Serie A1 2006-2007 (pallanuoto maschile) (revision 124739252)
|
||||
Toponimo (revision 122209204)
|
||||
Malta (revision 130935999)
|
||||
Conquista omayyade del Nord Africa (revision 127695759)
|
||||
Genovesato (revision 126177721)
|
||||
Anni 1990 (revision 130956165)
|
||||
Ottone (Italia) (revision 130545906)
|
||||
Serie A1 1989-1990 (pallanuoto maschile) (revision 123100527)
|
||||
Sarpa salpa (revision 127961617)
|
||||
Chiesa di San Michele Arcangelo (Fascia) (revision 125046519)
|
||||
Francia (revision 130948842)
|
||||
Bobbio (revision 130622556)
|
||||
Busalla (revision 130882212)
|
||||
Lista rossa IUCN (revision 130915943)
|
||||
Banu Tayy (revision 84116365)
|
||||
Marina Militare (Italia) (revision 130979101)
|
||||
Unione internazionale per la conservazione della natura (revision 130579833)
|
||||
Paolo Emilio Taviani (revision 130780337)
|
||||
Papa Leone I (revision 130618413)
|
||||
Canale (Fontanigorda) (revision 123574652)
|
||||
Squadra calcistica (revision 126640189)
|
||||
Canottieri Olona (revision 130332649)
|
||||
Italia (revision 130981754)
|
||||
Recco (revision 130550155)
|
||||
Posidonia oceanica (revision 127125148)
|
||||
Serie A1 1998-1999 (pallanuoto maschile) (revision 127919025)
|
||||
Chiesa di Santa Maria Assunta (Sant'Olcese) (revision 125048522)
|
||||
Divisione Nazionale A 1942 (revision 129237525)
|
||||
Linneo (revision 130638357)
|
||||
Ducato di Parma (revision 130172388)
|
||||
Agricoltura (revision 130664371)
|
||||
Abbazia territoriale di Monte Oliveto Maggiore (revision 130272809)
|
||||
Narcissus poeticus (revision 125041551)
|
||||
Montebruno (revision 130831738)
|
||||
Orazio (revision 130974052)
|
||||
Serie A 1969 (pallanuoto maschile) (revision 121285491)
|
||||
Province d'Italia (revision 130918567)
|
||||
Arenzano (revision 130948222)
|
||||
Hippocampus guttulatus (revision 128855778)
|
||||
Golfo di Guinea (revision 105877770)
|
||||
Poggio Favaro-San Bernardo (revision 125081247)
|
||||
Capoluogo (revision 130685749)
|
||||
Repubblica di Genova (revision 130768169)
|
||||
Pentema (revision 126141135)
|
||||
Val d'Aveto (revision 128737543)
|
||||
Congresso di Vienna (revision 130661388)
|
||||
Savignone (revision 130492313)
|
||||
Leccio (revision 130972482)
|
||||
Abbazia territoriale (revision 122408428)
|
||||
Popolazione (revision 130923533)
|
||||
Papa Celestino II (revision 124869649)
|
||||
Piccarello (revision 130492210)
|
||||
Monviso (revision 130330204)
|
||||
Congregazione del concilio (revision 130082114)
|
||||
Montoggio (revision 130491597)
|
||||
Guido Galletti (revision 125469744)
|
||||
Comuni della Liguria (revision 129469912)
|
||||
Genoa Cricket and Football Club Waterpolo (revision 121162887)
|
||||
Fieschi (revision 130660866)
|
||||
Propata (revision 130950866)
|
||||
Dominio (biologia) (revision 130125849)
|
||||
Valbrevenna (revision 130492851)
|
||||
Internet Archive (revision 130465900)
|
||||
Uscio (revision 128866468)
|
||||
Orologio (revision 130543447)
|
||||
Comune (Italia) (revision 130774715)
|
||||
Bertigaro (revision 122320383)
|
||||
Provincia di Genova (revision 129506405)
|
||||
Ophrys (revision 128266768)
|
||||
Codice catastale (revision 127142966)
|
||||
De Agostini (revision 130902849)
|
||||
Cristo degli abissi (revision 127736182)
|
||||
Bedonia (revision 129604507)
|
||||
Clima mediterraneo (revision 130419516)
|
||||
Testana (revision 119213804)
|
||||
Liguria (revision 130931853)
|
||||
Mignanego (revision 130491547)
|
||||
Cervino (revision 130958515)
|
||||
Specie (revision 129056674)
|
||||
Avegno (revision 130795555)
|
||||
Divisione Nazionale A 1940 (revision 130049982)
|
||||
Prefisso telefonico (revision 128985215)
|
||||
Legge ordinaria (revision 130685533)
|
||||
Comuni d'Italia (revision 127109732)
|
||||
Passo della Bocchetta (revision 127657640)
|
||||
Regno di Sardegna (1720-1861) (revision 130821005)
|
||||
Genova (revision 130975531)
|
||||
Densità di popolazione (revision 127833819)
|
||||
902 (revision 117897657)
|
||||
Dénia (revision 126955427)
|
||||
San Colombano Certenoli (revision 130492261)
|
||||
Giovanni Malalas (revision 125817717)
|
||||
Museo marinaro di Camogli (revision 122103975)
|
||||
Ludovico Ludovisi (revision 129913340)
|
||||
Mar Mediterraneo (revision 130743149)
|
||||
Castello di Torriglia (revision 125080889)
|
||||
Oratorio della Madonna del Suffragio (revision 120742009)
|
||||
Bestiame (revision 127582122)
|
||||
Stipola (revision 116665720)
|
||||
Benigembla (revision 112507275)
|
||||
1953 (revision 128662606)
|
||||
Gorreto (revision 128865878)
|
||||
Hong Kong (revision 130725302)
|
||||
Termine (diritto) (revision 92390137)
|
||||
Venezia (revision 130616026)
|
||||
Serie A 1957 (pallanuoto maschile) (revision 123074374)
|
||||
São Tomé e Príncipe (revision 130654782)
|
||||
Eudicotiledoni (revision 130190095)
|
||||
2001 (revision 130192341)
|
||||
Sandro Parcaroli (revision 130980950)
|
||||
Luigi XIV di Francia (revision 130911286)
|
||||
Emigrazione (revision 130905211)
|
||||
Palazzo Malvezzi De' Medici (revision 126221665)
|
||||
Lingua piemontese (revision 130765031)
|
||||
Marocco (revision 130983130)
|
||||
Zoagli (revision 130980639)
|
||||
Agrosistema (revision 128387197)
|
||||
Duilio Marcante (revision 130118457)
|
||||
Associazione Italiana Calciatori (revision 128956328)
|
||||
Serie A1 1999-2000 (pallanuoto maschile) (revision 123100630)
|
||||
Carasco (revision 130488282)
|
||||
Provincia di Savona (revision 130982095)
|
||||
Fuochi d'artificio (revision 128264351)
|
||||
Famiglia (tassonomia) (revision 125790574)
|
||||
Sedimento (revision 128753650)
|
||||
710 (revision 117897779)
|
||||
Città libere di Mentone e Roccabruna (revision 123049363)
|
||||
Stato di conservazione (biologia) (revision 130915940)
|
||||
Commenda di San Giovanni di Pré (revision 128755576)
|
||||
Savona (revision 130939330)
|
||||
Oratorio (architettura) (revision 130786628)
|
||||
Catholic Encyclopedia (revision 127585793)
|
||||
Sindaci di Genova (revision 130313202)
|
||||
Televisione in Italia (revision 130801206)
|
||||
1998 (revision 128802569)
|
||||
Roncaro (revision 129017868)
|
||||
Santuario di Nostra Signora della Guardia (revision 127664368)
|
||||
Pedogenesi (revision 130878265)
|
||||
Castiglione Chiavarese (revision 128865681)
|
||||
Penisola balcanica (revision 130823993)
|
||||
Tribogna (revision 130492831)
|
||||
Monocotiledoni (revision 127576318)
|
||||
1927 (revision 130983050)
|
||||
Serie A1 1987-1988 (pallanuoto maschile) (revision 123100512)
|
||||
Impero russo (revision 130957166)
|
||||
Telefonia mobile (revision 128938680)
|
||||
|
||||
== End of Parsed pages ==
|
||||
|
||||
- Wikipedia parsing ended at: 2022-12-15 00:04:23.457483
|
||||
|
||||
65 characters appeared 2610063 times.
|
||||
|
||||
Most Frequent characters:
|
||||
[ 0] Char i: 11.710598556433311 %
|
||||
[ 1] Char e: 11.295397850549968 %
|
||||
[ 2] Char a: 11.129424845300669 %
|
||||
[ 3] Char o: 8.864077227254667 %
|
||||
[ 4] Char n: 7.338175362050647 %
|
||||
[ 5] Char l: 7.110786214738877 %
|
||||
[ 6] Char t: 6.612599006230884 %
|
||||
[ 7] Char r: 6.366666245220901 %
|
||||
[ 8] Char s: 4.6817643865301335 %
|
||||
[ 9] Char c: 4.3799709049168545 %
|
||||
[10] Char d: 3.8855767083016772 %
|
||||
[11] Char u: 2.7626536217708155 %
|
||||
[12] Char p: 2.6831536250274417 %
|
||||
[13] Char m: 2.582198207476218 %
|
||||
[14] Char g: 1.94029799280707 %
|
||||
[15] Char v: 1.5553264423119288 %
|
||||
[16] Char f: 1.1005481476883892 %
|
||||
[17] Char b: 1.0104353802954182 %
|
||||
[18] Char z: 0.9865279113952421 %
|
||||
[19] Char h: 0.7462271983473195 %
|
||||
[20] Char q: 0.28060625356552693 %
|
||||
[21] Char à: 0.20631685901834554 %
|
||||
[22] Char è: 0.18731348630282105 %
|
||||
[23] Char x: 0.08524698445976209 %
|
||||
[24] Char ò: 0.08206698459002713 %
|
||||
[25] Char ù: 0.0803428882751106 %
|
||||
[26] Char k: 0.07789084018278486 %
|
||||
[27] Char y: 0.0714159006889872 %
|
||||
[28] Char w: 0.055094455574443986 %
|
||||
[29] Char é: 0.03704891414498424 %
|
||||
[30] Char j: 0.03390722752669188 %
|
||||
[31] Char ì: 0.033792287772364114 %
|
||||
[32] Char ë: 0.003601445635603432 %
|
||||
[33] Char ó: 0.0023371083379979717 %
|
||||
|
||||
The first 34 characters have an accumulated ratio of 0.9997938747072387.
|
||||
The first 4 characters have an accumulated ratio of 0.42999498479538617.
|
||||
All characters whose order is over 16 have an accumulated ratio of 0.03980172126113431.
|
||||
|
||||
1088 sequences found.
|
||||
|
||||
First 301 (typical positive ratio): 0.99504151171882
|
||||
Next 203 (504-301): 0.00396113186513658
|
||||
Rest: 0.0009973564160433712
|
||||
|
||||
- Processing end: 2022-12-15 00:04:23.551880
|
||||
1458
script/BuildLangModelLogs/LangKoreanModel.log
Normal file
1458
script/BuildLangModelLogs/LangKoreanModel.log
Normal file
File diff suppressed because it is too large
Load Diff
249
script/BuildLangModelLogs/LangLatvianModel.log
Normal file
249
script/BuildLangModelLogs/LangLatvianModel.log
Normal file
@ -0,0 +1,249 @@
|
||||
= Logs of language model for Latvian (lv) =
|
||||
|
||||
- Generated by BuildLangModel.py
|
||||
- Started: 2022-12-15 00:03:39.633493
|
||||
- Maximum depth: 4
|
||||
- Max number of pages: 200
|
||||
|
||||
== Parsed pages ==
|
||||
|
||||
Zigfrīds Anna Meierovics (revision 3686782)
|
||||
Starptautiskā lidosta "Rīga" (revision 3707308)
|
||||
Antante (revision 3716536)
|
||||
1925. gads (revision 3701975)
|
||||
Roberts Lauris (revision 3680009)
|
||||
Ansis Rudevics (revision 2700953)
|
||||
Ludvigs Sēja (revision 3456955)
|
||||
Versaļas līgums (revision 3711385)
|
||||
Fricis Jansons (revision 3675061)
|
||||
Visvaldis Sanders (revision 3674433)
|
||||
Aleksandrs Millerāns (revision 3568131)
|
||||
Jānis Goldmanis (revision 3687216)
|
||||
Alfrēds Birznieks (revision 3720873)
|
||||
Vudro Vilsons (revision 3713822)
|
||||
16. augusts (revision 3644133)
|
||||
1919 (revision 3659820)
|
||||
26. jūlijs (revision 3670740)
|
||||
Hermanis Kaupiņš (revision 3674496)
|
||||
Kristīgā nacionālā savienība (revision 3285684)
|
||||
Žaks Širaks (revision 3568088)
|
||||
Ādolfs Bļodnieks (revision 3561111)
|
||||
Viktors Barkāns (revision 3674296)
|
||||
Ludvigs Ēķis (revision 3532610)
|
||||
Baldones pagasts (revision 3689283)
|
||||
Jezups Laurinovičs (revision 3675112)
|
||||
Jānis Taube (revision 3675135)
|
||||
Lozanna (revision 3688002)
|
||||
Mančestra (revision 3529214)
|
||||
1907 (revision 3699682)
|
||||
Vladislavs Rejmonts (revision 3422886)
|
||||
Barselona (revision 3582646)
|
||||
1922. gads (revision 3718937)
|
||||
Jelgava (revision 3737903)
|
||||
Virungas nacionālais parks (revision 2711819)
|
||||
Ženēva (revision 3457596)
|
||||
Vilhelms fon Firkss (revision 3674324)
|
||||
Ernests Morics (revision 3674395)
|
||||
Krievijas imperators (revision 3729614)
|
||||
Apvienotā Karaliste (revision 3711364)
|
||||
1940 (revision 3670135)
|
||||
Latgales Kristīgo zemnieku savienība (revision 3705204)
|
||||
Sanmarīno (revision 3696307)
|
||||
Kārlis Lībtāls (revision 3675115)
|
||||
Trešais reihs (revision 3635453)
|
||||
Belavia (revision 3211106)
|
||||
Andrejs Petrevics (revision 3674399)
|
||||
4. decembris (revision 3717520)
|
||||
Džūkstes pagasts (revision 3597691)
|
||||
Latvijas Centrālās padomes memorands (revision 3660681)
|
||||
Jezups Trasuns (revision 3674439)
|
||||
Rūdolfs Bēnuss (revision 3610547)
|
||||
Danciga (revision 3705481)
|
||||
Deivids Hafmens (revision 3658996)
|
||||
Luijs XIV (revision 3734500)
|
||||
Tautu savienība (revision 3435512)
|
||||
Reinzeme (revision 2852532)
|
||||
30. jūlijs (revision 3666331)
|
||||
Latgales Zemnieku partija (revision 3668873)
|
||||
Aleksandrs Jaunbērzs (revision 3686681)
|
||||
17. decembris (revision 3648254)
|
||||
Britu Indija (revision 3726666)
|
||||
Baltkrievija (revision 3737890)
|
||||
Pauls Mincs (revision 3297022)
|
||||
Fransuā Ollands (revision 3571635)
|
||||
Ernests Bauers (revision 3560912)
|
||||
1928 (revision 3668599)
|
||||
Vladimirs Presņakovs (deputāts) (revision 3675124)
|
||||
Fricis Menders (revision 3741179)
|
||||
Pēteris Ulpe (revision 3685565)
|
||||
Bunds (revision 3668855)
|
||||
Bezpartijiskais nacionālais centrs (revision 3731258)
|
||||
Kuba (revision 3703863)
|
||||
Jūlijs Ērglis (revision 3686692)
|
||||
Tartu Universitāte (revision 3615677)
|
||||
Taizeme (revision 3738637)
|
||||
Lietuva (revision 3705755)
|
||||
Egons Knops (revision 3545557)
|
||||
2. aprīlis (revision 3645692)
|
||||
Virtuālā starptautiskā autoritatīvā datne (revision 3465437)
|
||||
Antons Dzenis (revision 3662152)
|
||||
Eduards Jaunzems (revision 3724684)
|
||||
14. oktobris (revision 3706344)
|
||||
Nacionālsociālisms (revision 3736342)
|
||||
1932 (revision 3702844)
|
||||
Kristaps Bungšs (revision 3674315)
|
||||
19. novembris (revision 3678574)
|
||||
17. marts (revision 3729568)
|
||||
Marģers Skujenieks (revision 3560686)
|
||||
1924. gads (revision 3723738)
|
||||
Austrumprūsija (revision 3683654)
|
||||
Vācbaltu reformu partija (revision 2783453)
|
||||
Diseldorfa (revision 3720512)
|
||||
Poznaņa (revision 3626800)
|
||||
Pols Reno (revision 3568101)
|
||||
Gustavs Reinhards (revision 3674422)
|
||||
Bordo (revision 3662671)
|
||||
Amerikas Savienotās Valstis (revision 3738460)
|
||||
Broņislavs Kudeiko (revision 3544882)
|
||||
Latvijas Republikas satiksmes ministrs (revision 3720427)
|
||||
10. septembris (revision 3676364)
|
||||
Deivids Loids Džordžs (revision 3665955)
|
||||
Kārlis Dēķens (revision 3544680)
|
||||
Pilskalnes lidlauks (revision 3541876)
|
||||
Gustavs Zemgals (revision 3616315)
|
||||
Ceire-Cion (revision 3668866)
|
||||
Jānis Višņa (revision 3685546)
|
||||
Armāns Faljērs (revision 3710557)
|
||||
Valmieras apriņķis (revision 3645722)
|
||||
Nacionālā apvienība (pirmskara) (revision 3731258)
|
||||
Vadživaruds (revision 3602806)
|
||||
Pomerānija (revision 3287651)
|
||||
PSRS (revision 3739451)
|
||||
Lielbritānija (revision 3711364)
|
||||
Jānis Pauļuks (politiķis) (revision 3706260)
|
||||
Jānis Lībietis (ārsts) (revision 3559846)
|
||||
Miķelis Bružis (revision 3675043)
|
||||
2008. gads (revision 3738998)
|
||||
25. jūnijs (revision 3583477)
|
||||
Pēteris Juraševskis (revision 3695647)
|
||||
Voldemārs Bastjānis (revision 3716209)
|
||||
Eduards Miķelsons (revision 2928933)
|
||||
Jānis Birznieks (revision 3560607)
|
||||
Indiāna (revision 3591725)
|
||||
Frīdrihs Vesmanis (revision 3695649)
|
||||
Vēsture (revision 3693386)
|
||||
Klāra Kalniņa (revision 3739312)
|
||||
1858. gads (revision 3682953)
|
||||
Rēzeknes Piena konservu kombināts (revision 3684434)
|
||||
Eduards Radziņš (revision 3559849)
|
||||
Jānis Mazvērsītis (revision 3674394)
|
||||
Jakovs Helmanis (revision 3675057)
|
||||
Iraklija (revision 3507811)
|
||||
Zelma Cēsniece-Freidenfelde (revision 3675045)
|
||||
Beļģija (revision 3725424)
|
||||
4. Vidzemes latviešu strēlnieku pulks (revision 3658761)
|
||||
Rūdolfs Lindiņš (revision 3674388)
|
||||
1941. gads (revision 3682960)
|
||||
28. oktobris (revision 3638975)
|
||||
Fricis Venevics (revision 3675314)
|
||||
7. jūlijs (revision 3634301)
|
||||
1894. gads Latvijā (revision 3720831)
|
||||
Slampes lidlauks (revision 2938226)
|
||||
Hermanis Salnis (revision 3716194)
|
||||
Ansis Buševics (revision 3683798)
|
||||
Aleksandrs Neibergs (deputāts) (revision 3675118)
|
||||
Dominiks de Vilpēns (revision 3679792)
|
||||
1. jūnijs (revision 3630863)
|
||||
Augusts Kirhenšteins (revision 3662841)
|
||||
Simbols (revision 3599652)
|
||||
18. decembris (revision 3627307)
|
||||
Latvijas Satversmes sapulce (revision 3546097)
|
||||
Hermanis Enzeliņš (revision 3560600)
|
||||
Latvijas Poļu savienība (revision 3668852)
|
||||
Kazahstāna (revision 3688816)
|
||||
Kasimovas hans (revision 3729732)
|
||||
1917 (revision 3698547)
|
||||
Inčukalna pagasts (revision 3691272)
|
||||
Īzaks Baševiss Zingers (revision 3555487)
|
||||
Kārlis Plāters (revision 3678129)
|
||||
Voldemārs Pusulls (revision 3674821)
|
||||
Elektrotehnika (revision 3595249)
|
||||
2. jūnijs (revision 3648119)
|
||||
1186 (revision 2396057)
|
||||
Latvieši (revision 3729937)
|
||||
Eduards Tomass (revision 3675137)
|
||||
Jānis Goliass (revision 3675054)
|
||||
Jaltas konference (revision 3445052)
|
||||
5. janvāris (revision 3676367)
|
||||
Īzaks Berss (revision 3548498)
|
||||
Vinstons Čērčils (revision 3714664)
|
||||
Fricis Deglavs (revision 3733737)
|
||||
Staņislavs Ozoliņš (revision 3675119)
|
||||
1492 (revision 2591769)
|
||||
22. maijs (revision 3579659)
|
||||
Ernests Miezis (revision 3674633)
|
||||
Fjodors Pavlovs (revision 3675121)
|
||||
1944 (revision 3649960)
|
||||
Trešdiena (revision 3537122)
|
||||
Teodors Grīnbergs (revision 3735023)
|
||||
11. augusts (revision 3670138)
|
||||
|
||||
== End of Parsed pages ==
|
||||
|
||||
- Wikipedia parsing ended at: 2022-12-15 00:06:43.949536
|
||||
|
||||
72 characters appeared 681085 times.
|
||||
|
||||
Most Frequent characters:
|
||||
[ 0] Char a: 11.728785687542672 %
|
||||
[ 1] Char i: 9.616420857895857 %
|
||||
[ 2] Char s: 8.86761564268777 %
|
||||
[ 3] Char e: 6.2965709125880025 %
|
||||
[ 4] Char r: 5.789585734526527 %
|
||||
[ 5] Char t: 5.762716841510238 %
|
||||
[ 6] Char u: 4.638187597730092 %
|
||||
[ 7] Char n: 4.497236027808571 %
|
||||
[ 8] Char ā: 3.8918783999060325 %
|
||||
[ 9] Char o: 3.768398951672699 %
|
||||
[10] Char l: 3.7578275839285844 %
|
||||
[11] Char k: 3.570185806470558 %
|
||||
[12] Char d: 3.432464376693071 %
|
||||
[13] Char m: 3.1982792162505413 %
|
||||
[14] Char v: 2.785995874230089 %
|
||||
[15] Char p: 2.7808570149100333 %
|
||||
[16] Char j: 2.6051080261641353 %
|
||||
[17] Char g: 1.8922748261964366 %
|
||||
[18] Char b: 1.808878480659536 %
|
||||
[19] Char ī: 1.5877607053451477 %
|
||||
[20] Char z: 1.5134674820323455 %
|
||||
[21] Char ē: 1.4192061196473273 %
|
||||
[22] Char c: 1.2908814611979413 %
|
||||
[23] Char š: 0.7927057562565611 %
|
||||
[24] Char f: 0.45970767231696485 %
|
||||
[25] Char ņ: 0.38688269452417834 %
|
||||
[26] Char ļ: 0.35869238053987385 %
|
||||
[27] Char h: 0.3382837678116535 %
|
||||
[28] Char ū: 0.33152983841958056 %
|
||||
[29] Char ž: 0.2300740729864848 %
|
||||
[30] Char ķ: 0.14638407834558095 %
|
||||
[31] Char ģ: 0.13287621956143508 %
|
||||
[32] Char č: 0.12406674644133991 %
|
||||
[33] Char y: 0.06078536452865649 %
|
||||
[34] Char w: 0.04874575126452645 %
|
||||
[35] Char x: 0.0226109810082442 %
|
||||
[36] Char é: 0.009690420432104656 %
|
||||
[37] Char æ: 0.007634876704082457 %
|
||||
[38] Char ü: 0.0063134557360681855 %
|
||||
[39] Char q: 0.005872982080063428 %
|
||||
|
||||
The first 40 characters have an accumulated ratio of 0.9996344068655164.
|
||||
The first 5 characters have an accumulated ratio of 0.42298978835240825.
|
||||
All characters whose order is over 22 have an accumulated ratio of 0.034628570589573984.
|
||||
|
||||
1225 sequences found.
|
||||
|
||||
First 641 (typical positive ratio): 0.9950084809045374
|
||||
Next 222 (863-641): 0.003991829224555987
|
||||
Rest: 0.0009996898709065949
|
||||
|
||||
- Processing end: 2022-12-15 00:06:44.066079
|
||||
238
script/BuildLangModelLogs/LangLithuanianModel.log
Normal file
238
script/BuildLangModelLogs/LangLithuanianModel.log
Normal file
@ -0,0 +1,238 @@
|
||||
= Logs of language model for Lithuanian (lt) =
|
||||
|
||||
- Generated by BuildLangModel.py
|
||||
- Started: 2022-12-15 00:01:46.650823
|
||||
- Maximum depth: 4
|
||||
- Max number of pages: 200
|
||||
|
||||
== Parsed pages ==
|
||||
|
||||
Karūna (laivas) (revision 6615331)
|
||||
Švedų kalba (revision 6700452)
|
||||
1928 (revision 6669868)
|
||||
Flagmanas (laivas) (revision 6759598)
|
||||
Varytuvas (revision 6786126)
|
||||
XVII a. (revision 6787652)
|
||||
Liepos 1 d. (revision 6754085)
|
||||
Rugpjūčio 10 (revision 6617623)
|
||||
Grimzlė (revision 6761216)
|
||||
1664 (revision 5301833)
|
||||
Vaza (laivas) (revision 6497236)
|
||||
1665 (revision 5301834)
|
||||
Rišeljė (revision 5833341)
|
||||
Adolfas Krogertas (revision 6565346)
|
||||
Aviacija (revision 6811479)
|
||||
1973 (revision 6610221)
|
||||
Keliamieji metai, prasidedantys antradienį (revision 6766001)
|
||||
Rugpjūčio 24 (revision 6423702)
|
||||
1991 (revision 6150102)
|
||||
T (revision 6783088)
|
||||
Banginiai (revision 6804052)
|
||||
Vaidas Matonis (revision 6565973)
|
||||
Poetas (revision 6403284)
|
||||
Prancūzija (revision 6643111)
|
||||
Afrikanų kalba (revision 6152685)
|
||||
Tuniso vyrų futbolo rinktinė (revision 6801286)
|
||||
1666 m. (revision 5301835)
|
||||
1895 (revision 6150013)
|
||||
Alfonsas Petrulis (revision 6513426)
|
||||
XVI amžius (revision 6787653)
|
||||
Lietuvos Didžioji Kunigaikštystė (revision 6754017)
|
||||
1601 (revision 5301713)
|
||||
Progresyvusis metalas (revision 6777851)
|
||||
Ispanijos karalius (revision 5577241)
|
||||
1964 (revision 6342494)
|
||||
Geologija (revision 6555589)
|
||||
Alfonsas VI (revision 6790669)
|
||||
Rumunija (revision 6752868)
|
||||
Centrinės vokiečių tarmės (revision 6658980)
|
||||
Berberų kalendorius (revision 6286996)
|
||||
Paukščiai (revision 6681188)
|
||||
Prancūzijos revoliucija (revision 6730590)
|
||||
XVII amžiaus 7-as dešimtmetis (revision 6787652)
|
||||
XVII amžiaus 1-as dešimtmetis (revision 6787652)
|
||||
Patranka (revision 6775738)
|
||||
Barokas (revision 6799370)
|
||||
Saka kalendorius (revision 6109866)
|
||||
Anglų kalba (revision 6812014)
|
||||
Aleksandras Bogdanovas (revision 6790602)
|
||||
Grammy apdovanojimas (revision 4864126)
|
||||
Olandų kalba (revision 6532580)
|
||||
1607 (revision 5301718)
|
||||
Metai (revision 5765072)
|
||||
Variklis (revision 6143447)
|
||||
Bahajų kalendorius (revision 6367987)
|
||||
R (revision 6778284)
|
||||
Bavarų tarmė (revision 6040251)
|
||||
Šiaurės fryzų kalba (revision 6573722)
|
||||
Liepos 8 (revision 6664845)
|
||||
1994 (revision 6586230)
|
||||
1615 (revision 5687980)
|
||||
2 tūkstantmetis (revision 6756525)
|
||||
Genovaitė Jasiulienė (revision 4953583)
|
||||
1702 (revision 6614915)
|
||||
Kinų kalendorius (revision 6766373)
|
||||
Citoskeletas (revision 6794578)
|
||||
Kanapė (revision 6765202)
|
||||
Ruoniniai (revision 6802226)
|
||||
Laivagraužiai (revision 6768667)
|
||||
1674 (revision 5301878)
|
||||
1627 (revision 6421529)
|
||||
XVIII amžius (revision 6734712)
|
||||
1663 m. (revision 5472094)
|
||||
Korėjiečių kalendorius (revision 6767341)
|
||||
Viljamas Šekspyras (revision 6705872)
|
||||
Tajų saulės kalendorius (revision 4705906)
|
||||
Jonušas Radvila (revision 6568066)
|
||||
1976 (revision 6614657)
|
||||
1997 m. Nobelio premijos laureatai (revision 6354150)
|
||||
1921 (revision 6727523)
|
||||
J (revision 6763325)
|
||||
Sniegaeigis (revision 4729221)
|
||||
TSKP CK Politbiuras (revision 6615809)
|
||||
Blakstienotosios kirmėlės (revision 6757660)
|
||||
Šikšnosparniai (revision 6215031)
|
||||
Pomeranija (revision 6777309)
|
||||
Galeonas (revision 6760086)
|
||||
NBA (revision 6805735)
|
||||
1635 (revision 5301779)
|
||||
1580 (revision 5301668)
|
||||
Jurgis Bialozoras (revision 5481171)
|
||||
Adobe Flash (revision 6245137)
|
||||
1628 (revision 5301771)
|
||||
Vilniaus universitetas (revision 6587086)
|
||||
Švedija (revision 6705415)
|
||||
Lapkričio 18 (revision 6494116)
|
||||
Budistų kalendorius (revision 6793741)
|
||||
1961 (revision 6585934)
|
||||
Traktorius (revision 6709383)
|
||||
1852 (revision 6587281)
|
||||
2010 (revision 6604084)
|
||||
Chosė Benitas de Čurigera (revision 6764083)
|
||||
Kovo 18 (revision 6466532)
|
||||
1577 m. (revision 6091965)
|
||||
Lenkijos karalystė (revision 6756257)
|
||||
Angolos prezidentas (revision 5789584)
|
||||
Nekeliamieji metai, prasidedantys pirmadienį (revision 6773389)
|
||||
Sausio 25 (revision 6755589)
|
||||
Nobelio taikos premija (revision 6710284)
|
||||
Rusijos imperijos Valstybės Dūma (revision 6615725)
|
||||
Kristupas Dorohostaiskis (revision 6419193)
|
||||
Eurelijus Žukauskas (revision 6175174)
|
||||
Charles Gounod (revision 6491765)
|
||||
2007 (revision 6266252)
|
||||
Kiaušialąstė (revision 6766209)
|
||||
Kalendoriaus era (revision 6764948)
|
||||
Lietuvos kariuomenė (revision 6799446)
|
||||
Lapkričio 26 (revision 6805301)
|
||||
Kaukazas (revision 6215089)
|
||||
Austrijos istorija (revision 6682730)
|
||||
1956 m. Nobelio premijos laureatai (revision 6354068)
|
||||
Klasė (biologija) (revision 6811932)
|
||||
1633 (revision 6614909)
|
||||
Rytų fryzų kalba (revision 6038842)
|
||||
Rugsėjo 14 (revision 6729628)
|
||||
Ferdinandas IV (Kastilija) (revision 6759408)
|
||||
Rumi kalendorius (revision 6779509)
|
||||
XIV amžiaus 1-as dešimtmetis (revision 5748254)
|
||||
Kiurasao (revision 6268834)
|
||||
Kompozitorius (revision 6812678)
|
||||
1554 (revision 5301618)
|
||||
LDI (revision 6565867)
|
||||
Abraomas Vaina (revision 5225996)
|
||||
Akis (revision 5782074)
|
||||
Automobilis (revision 6503408)
|
||||
1975 (revision 6803278)
|
||||
Pluoštas (revision 5160560)
|
||||
1517 (revision 6221041)
|
||||
Fryzų kalbos (revision 6702541)
|
||||
Čučhės kalendorius (revision 5134916)
|
||||
Koptų kalendorius (revision 6767212)
|
||||
1659 (revision 6421510)
|
||||
Respublika (revision 6367313)
|
||||
Ṯ (revision 6789824)
|
||||
Senatas (revision 6454019)
|
||||
XVII amžiaus 9-as dešimtmetis (revision 6787652)
|
||||
1968 (revision 6646708)
|
||||
TVF (revision 6644956)
|
||||
Gegužės 15 (revision 6727491)
|
||||
Rūta Mikelkevičiūtė (revision 5312087)
|
||||
Communic (revision 4365658)
|
||||
XI amžiaus 5-as dešimtmetis (revision 6787645)
|
||||
1636 (revision 5301780)
|
||||
FIBA Okeanija (revision 6251013)
|
||||
Afrikos Tautų taurė (revision 6482596)
|
||||
Pilypas II Habsburgas (revision 6727186)
|
||||
Birželio 11 (revision 6617470)
|
||||
Balandžio 13 (revision 6730671)
|
||||
2333 m. pr. m. e. (revision 6615993)
|
||||
Plūgas (revision 6777058)
|
||||
1982 (revision 6681768)
|
||||
Senoji norvegų kalba (revision 6340813)
|
||||
Birželio 5 (revision 6806306)
|
||||
1675 m. (revision 5301879)
|
||||
XVII amžiaus 2-as dešimtmetis (revision 6787652)
|
||||
2005 (revision 6092829)
|
||||
1966 (revision 6691317)
|
||||
Lietuvos Didžiosios Kunigaikštystės kariuomenė (revision 6769403)
|
||||
Aukštakaktis snapuotis (revision 6791901)
|
||||
Visuotinė lietuvių enciklopedija (revision 6600066)
|
||||
Senovės indų kalendoriai (revision 6780792)
|
||||
Šiaurės Fryzija (revision 6788630)
|
||||
|
||||
== End of Parsed pages ==
|
||||
|
||||
- Wikipedia parsing ended at: 2022-12-15 00:05:02.242918
|
||||
|
||||
67 characters appeared 840839 times.
|
||||
|
||||
Most Frequent characters:
|
||||
[ 0] Char i: 12.818625206490184 %
|
||||
[ 1] Char a: 10.979628680401362 %
|
||||
[ 2] Char s: 8.576790562759339 %
|
||||
[ 3] Char o: 6.9993185377937985 %
|
||||
[ 4] Char r: 5.686582092410081 %
|
||||
[ 5] Char e: 5.511399923171974 %
|
||||
[ 6] Char n: 5.251421496862062 %
|
||||
[ 7] Char t: 5.147477697870817 %
|
||||
[ 8] Char u: 4.48516303358907 %
|
||||
[ 9] Char k: 4.023719166213747 %
|
||||
[10] Char l: 3.7915700865445108 %
|
||||
[11] Char m: 3.3342887282821088 %
|
||||
[12] Char d: 2.7928057571068896 %
|
||||
[13] Char v: 2.7183563084014892 %
|
||||
[14] Char p: 2.481925790787535 %
|
||||
[15] Char j: 2.3496769298284215 %
|
||||
[16] Char g: 1.9117809711490548 %
|
||||
[17] Char ė: 1.6236164117030727 %
|
||||
[18] Char b: 1.519791541543625 %
|
||||
[19] Char y: 1.2309134091068563 %
|
||||
[20] Char ų: 1.1750168581619074 %
|
||||
[21] Char š: 0.9344238314350309 %
|
||||
[22] Char ž: 0.7738699085080497 %
|
||||
[23] Char c: 0.7027504670929868 %
|
||||
[24] Char z: 0.464417088170268 %
|
||||
[25] Char č: 0.4489563400365587 %
|
||||
[26] Char f: 0.4287384386309389 %
|
||||
[27] Char ū: 0.40661767591655473 %
|
||||
[28] Char ą: 0.3877079916607103 %
|
||||
[29] Char h: 0.3816426212390243 %
|
||||
[30] Char į: 0.3095717491695794 %
|
||||
[31] Char ę: 0.16340821489012758 %
|
||||
[32] Char x: 0.08253660926764815 %
|
||||
[33] Char w: 0.04733367505551003 %
|
||||
[34] Char é: 0.012130740843371917 %
|
||||
[35] Char ö: 0.009871093039214403 %
|
||||
[36] Char q: 0.008325018225843474 %
|
||||
|
||||
The first 37 characters have an accumulated ratio of 0.9997217065335932.
|
||||
The first 5 characters have an accumulated ratio of 0.45060945079854764.
|
||||
All characters whose order is over 23 have an accumulated ratio of 0.0315125725614535.
|
||||
|
||||
1159 sequences found.
|
||||
|
||||
First 556 (typical positive ratio): 0.9950187236479393
|
||||
Next 217 (773-556): 0.003987547510905687
|
||||
Rest: 0.000993728841155006
|
||||
|
||||
- Processing end: 2022-12-15 00:05:02.338906
|
||||
248
script/BuildLangModelLogs/LangMacedonianModel.log
Normal file
248
script/BuildLangModelLogs/LangMacedonianModel.log
Normal file
@ -0,0 +1,248 @@
|
||||
= Logs of language model for Macedonian (mk) =
|
||||
|
||||
- Generated by BuildLangModel.py
|
||||
- Started: 2022-12-17 22:01:17.484142
|
||||
- Maximum depth: 4
|
||||
- Max number of pages: 200
|
||||
|
||||
== Parsed pages ==
|
||||
|
||||
Хелсинки (revision 4901169)
|
||||
Западен Берлин (revision 4609007)
|
||||
Средна Европа (revision 4658727)
|
||||
Украина (revision 4859969)
|
||||
Земји членки на Европската Унија (revision 3925804)
|
||||
Кипар (revision 4898295)
|
||||
Метрополитенска област (revision 4601372)
|
||||
Вајмар (revision 4642566)
|
||||
Гернзи (revision 4849858)
|
||||
Летни олимписки игри 2008 (revision 4904330)
|
||||
Кина (revision 4874294)
|
||||
Пекинг (revision 4898517)
|
||||
Бразил (revision 4887064)
|
||||
Нормативна контрола (revision 4647772)
|
||||
Пном Пен (revision 4655657)
|
||||
Малта (revision 4859950)
|
||||
Паритет на куповна моќ (revision 4885746)
|
||||
Обединета нормативна податотека (revision 4624340)
|
||||
ФИБА Европа (revision 4888619)
|
||||
Белгија (revision 4881095)
|
||||
Европски суд за човекови права (revision 4872819)
|
||||
GMT (revision 4857360)
|
||||
Јохан Волфганг фон Гете (revision 4847592)
|
||||
Лондон (revision 4820446)
|
||||
САД (revision 4887829)
|
||||
Хонгконг (revision 4644474)
|
||||
Регион (revision 4440392)
|
||||
Шангај (revision 4829926)
|
||||
Холандија (revision 4859982)
|
||||
Национална библиотека на Австралија (revision 4821571)
|
||||
Сантјаго де Компостела (revision 4790447)
|
||||
В’лтава (revision 4480493)
|
||||
Печ (revision 4836968)
|
||||
Литванија (revision 4859985)
|
||||
Централна Африка (revision 4880126)
|
||||
1808 (revision 4205012)
|
||||
Втора светска војна (revision 4878249)
|
||||
Сувереност (revision 4847447)
|
||||
Општествено уредување (revision 4562058)
|
||||
Француска Гвајана (revision 4658818)
|
||||
Бразавил (revision 4833032)
|
||||
Англија (revision 4831557)
|
||||
Сомалија (revision 4826045)
|
||||
Собрание (revision 4721533)
|
||||
Поштенски број (revision 4890825)
|
||||
ISO 4217 (revision 4900097)
|
||||
Никозија (revision 4821748)
|
||||
Исток (revision 4575999)
|
||||
Европа (revision 4898183)
|
||||
Азија (revision 4879497)
|
||||
Романија (revision 4880087)
|
||||
Република Кина (revision 4859932)
|
||||
Музејски Остров (revision 4642577)
|
||||
Ченгду (revision 4838944)
|
||||
Саудиска Арабија (revision 4904971)
|
||||
Шри Ланка (revision 4829991)
|
||||
Соединети Американски Држави (revision 4887829)
|
||||
Питкерн (revision 4879701)
|
||||
Берлин (revision 4898023)
|
||||
Турција (revision 4898742)
|
||||
Зимски олимписки игри 2022 (revision 4879017)
|
||||
Страна на возење (revision 4883774)
|
||||
Национална библиотека на Франција (revision 4859687)
|
||||
Статуа на Исус Христос Искупителот (revision 4781328)
|
||||
Грција (revision 4883904)
|
||||
Британска Индоокеанска Територија (revision 4847444)
|
||||
Германија (revision 4898116)
|
||||
Унгарија (revision 4859996)
|
||||
Список на земјите по највисок државен домен (revision 4660191)
|
||||
Естонија (revision 4904061)
|
||||
Ризница (Викимедија) (revision 4605630)
|
||||
5 мај (revision 4286017)
|
||||
Ријад (revision 4825472)
|
||||
Баптизам (revision 4893797)
|
||||
Индокина (revision 3860492)
|
||||
ГДР (revision 4804373)
|
||||
Черкаска Област (revision 4586879)
|
||||
Данска (revision 4860001)
|
||||
Источен Берлин (revision 4847439)
|
||||
Парагвај (revision 4823976)
|
||||
Студена војна (revision 4873005)
|
||||
Лихтенштајн (revision 4859989)
|
||||
Србија (revision 4888612)
|
||||
1933 (revision 4205151)
|
||||
Монголски јазик (revision 4822543)
|
||||
Виртуелна меѓународна нормативна податотека (revision 4063132)
|
||||
Европска Унија (revision 4878852)
|
||||
Трет Рајх (revision 4873021)
|
||||
Шведска (revision 4859974)
|
||||
Тириншка Шума (revision 4620246)
|
||||
Стрелаштво на Летните олимписки игри - 2008 (revision 4854954)
|
||||
Караимски јазик (revision 4578663)
|
||||
Бруто-домашен производ (revision 4839401)
|
||||
Западна Европа (revision 4795691)
|
||||
Туркиски јазици (revision 4811424)
|
||||
Зимски олимписки игри 2006 (revision 4279937)
|
||||
Список на држави и територии по површина (revision 4880407)
|
||||
Венецијанска Република (revision 4872002)
|
||||
Монголија (revision 4859944)
|
||||
Држава Палестина (revision 4898178)
|
||||
Список на земји (revision 4818847)
|
||||
1821 (revision 4205027)
|
||||
Контролен број на Конгресната библиотека (revision 4500225)
|
||||
Јунан (revision 4609778)
|
||||
Гибралтар (revision 4849866)
|
||||
Тихи Океан (revision 4898720)
|
||||
Брисел (revision 4816384)
|
||||
Острава (revision 4648232)
|
||||
Хрватска (revision 4859986)
|
||||
Авганистан (revision 4897935)
|
||||
Обединетото Кралство (revision 4878275)
|
||||
Везер (revision 4611529)
|
||||
Марлен Дитрих (revision 4898384)
|
||||
Фарски Острови (revision 4828720)
|
||||
Илм (округ) (revision 4622799)
|
||||
Пакистан (revision 4893644)
|
||||
Пафос (revision 4540073)
|
||||
Москва (revision 4836476)
|
||||
Океанија (revision 4847323)
|
||||
Франција (revision 4859997)
|
||||
Брно (revision 4654440)
|
||||
Премиер на Кина (revision 4365963)
|
||||
Тибетско писмо (revision 4855132)
|
||||
Рајхстаг (revision 4748712)
|
||||
Географија (revision 4593743)
|
||||
Бенгалски календар (revision 4467317)
|
||||
Соединетите Држави (revision 4887829)
|
||||
Список на држави (revision 4818847)
|
||||
Запорошка област (revision 4795592)
|
||||
Литвански (revision 4847113)
|
||||
ОБСЕ (revision 4751462)
|
||||
Молиер (revision 4834680)
|
||||
Моравскошлески крај (revision 4203476)
|
||||
Португалија (revision 4859979)
|
||||
Зимски олимписки игри 1948 (revision 4750285)
|
||||
1938 (revision 4444155)
|
||||
Град во Парагвај (revision 4530019)
|
||||
Норвешка (revision 4859981)
|
||||
Државно знаме (revision 3360721)
|
||||
Уганда (revision 4828524)
|
||||
Калај (revision 4901898)
|
||||
Национална библиотека на Чешка (revision 4859689)
|
||||
Кувајт (revision 4859952)
|
||||
Национална парламентарна библиотека (Јапонија) (revision 4821574)
|
||||
Сеул (revision 4837760)
|
||||
Авторитаризам (revision 4763980)
|
||||
Чисто писмо (revision 4648379)
|
||||
УНЕСКО (revision 4768869)
|
||||
Кампала (revision 4724511)
|
||||
Монголи (revision 4821043)
|
||||
Валдовињо (revision 4555459)
|
||||
Стреличарство на Летните олимписки игри 2016 (revision 4827288)
|
||||
Сеута (revision 4529464)
|
||||
Тоуро (revision 4555813)
|
||||
1880 (revision 4485297)
|
||||
Кинески Тајпеј (revision 4859932)
|
||||
Азербејџан (revision 4897943)
|
||||
Источен Тимор (revision 4859940)
|
||||
Меѓународен олимписки комитет (revision 4585376)
|
||||
Обединето Кралство (revision 4878275)
|
||||
Ежен Делакроа (revision 4850741)
|
||||
Квадратен километар (revision 4177969)
|
||||
Бронзено време (revision 4687506)
|
||||
|
||||
== End of Parsed pages ==
|
||||
|
||||
- Wikipedia parsing ended at: 2022-12-17 22:04:50.749722
|
||||
|
||||
71 characters appeared 1512742 times.
|
||||
|
||||
Most Frequent characters:
|
||||
[ 0] Char а: 13.171909023481861 %
|
||||
[ 1] Char о: 10.210531604199527 %
|
||||
[ 2] Char и: 9.815156847631652 %
|
||||
[ 3] Char е: 7.806089868596231 %
|
||||
[ 4] Char н: 7.393263358854318 %
|
||||
[ 5] Char т: 6.73862429945093 %
|
||||
[ 6] Char р: 5.301366657367879 %
|
||||
[ 7] Char с: 4.862098097362273 %
|
||||
[ 8] Char в: 4.178372782668823 %
|
||||
[ 9] Char к: 3.809307866113323 %
|
||||
[10] Char д: 3.5743702495204075 %
|
||||
[11] Char л: 3.571263308614423 %
|
||||
[12] Char п: 2.6521376414484426 %
|
||||
[13] Char м: 2.191384915603586 %
|
||||
[14] Char у: 2.13499724341626 %
|
||||
[15] Char ј: 1.9410448047320692 %
|
||||
[16] Char г: 1.8507452030815565 %
|
||||
[17] Char з: 1.656858869523025 %
|
||||
[18] Char б: 1.3595180143077934 %
|
||||
[19] Char ц: 0.8072757945505579 %
|
||||
[20] Char ч: 0.6892120401231671 %
|
||||
[21] Char ш: 0.6130589353637302 %
|
||||
[22] Char ж: 0.44072287277010885 %
|
||||
[23] Char ф: 0.3884998234993145 %
|
||||
[24] Char х: 0.27631942525559544 %
|
||||
[25] Char њ: 0.24419233418520805 %
|
||||
[26] Char e: 0.19844758722901856 %
|
||||
[27] Char i: 0.19097770802952518 %
|
||||
[28] Char a: 0.17398869073510223 %
|
||||
[29] Char ќ: 0.16175924248814405 %
|
||||
[30] Char n: 0.14477022519372107 %
|
||||
[31] Char ѓ: 0.14113444328246325 %
|
||||
[32] Char r: 0.13577992810406533 %
|
||||
[33] Char s: 0.12645910538611344 %
|
||||
[34] Char t: 0.1238149003597441 %
|
||||
[35] Char o: 0.11032945472526048 %
|
||||
[36] Char l: 0.08944023501694275 %
|
||||
[37] Char c: 0.08203646094310861 %
|
||||
[38] Char u: 0.0737072151100452 %
|
||||
[39] Char d: 0.06313039500456787 %
|
||||
[40] Char m: 0.0618082924913832 %
|
||||
[41] Char h: 0.055726620930733724 %
|
||||
[42] Char џ: 0.054470623543208294 %
|
||||
[43] Char g: 0.051165367260246626 %
|
||||
[44] Char b: 0.043232752181138624 %
|
||||
[45] Char p: 0.04078686253174699 %
|
||||
[46] Char f: 0.030540568054565814 %
|
||||
[47] Char k: 0.028160783530833414 %
|
||||
[48] Char v: 0.02730141689726338 %
|
||||
[49] Char y: 0.025847104132760243 %
|
||||
[50] Char w: 0.02201300684452471 %
|
||||
[51] Char x: 0.021219745336613912 %
|
||||
[52] Char j: 0.009519138094929606 %
|
||||
[53] Char z: 0.00786650995344877 %
|
||||
[54] Char љ: 0.007403774073834138 %
|
||||
[55] Char ѕ: 0.00489177929878327 %
|
||||
|
||||
The first 56 characters have an accumulated ratio of 0.9998605181848591.
|
||||
The first 4 characters have an accumulated ratio of 0.41003687343909273.
|
||||
All characters whose order is over 22 have an accumulated ratio of 0.03216741519703955.
|
||||
|
||||
1405 sequences found.
|
||||
|
||||
First 613 (typical positive ratio): 0.9950204964819953
|
||||
Next 273 (886-613): 0.003979891583654749
|
||||
Rest: 0.0009996119343499421
|
||||
|
||||
- Processing end: 2022-12-17 22:04:50.898793
|
||||
173
script/BuildLangModelLogs/LangMalteseModel.log
Normal file
173
script/BuildLangModelLogs/LangMalteseModel.log
Normal file
@ -0,0 +1,173 @@
|
||||
= Logs of language model for Maltese (mt) =
|
||||
|
||||
- Generated by BuildLangModel.py
|
||||
- Started: 2022-12-15 00:01:02.004586
|
||||
- Maximum depth: 4
|
||||
- Max number of pages: 200
|
||||
|
||||
== Parsed pages ==
|
||||
|
||||
Unjoni Ewropea (revision 279041)
|
||||
Pajjiżi l-Baxxi (revision 281883)
|
||||
Greċja (revision 266497)
|
||||
Djalett (revision 269151)
|
||||
Lista ta' pajjiżi skont id-densità ta' popolazzjoni (revision 272026)
|
||||
Ċipru (revision 281387)
|
||||
Abkażja (revision 266550)
|
||||
Żvizzera (revision 268804)
|
||||
Portugall (revision 279398)
|
||||
Każakistan (revision 280059)
|
||||
Repubblika tal-Maċedonja ta' Fuq (revision 281602)
|
||||
Netherlands (revision 281883)
|
||||
Lista ta' pajjiżi skont l-erja (revision 260621)
|
||||
Żvezja (revision 282136)
|
||||
Ewro (revision 283047)
|
||||
Indoneżja (revision 279426)
|
||||
Monaco (revision 281097)
|
||||
Ċekja (revision 279325)
|
||||
Awstrija (revision 273952)
|
||||
It-Tieni Gwerra Dinjija (revision 273105)
|
||||
Bulgarija (revision 266495)
|
||||
Belarussja (revision 270102)
|
||||
Arabja Sawdija (revision 271085)
|
||||
Montenegro (revision 279276)
|
||||
Iżlanda (revision 280133)
|
||||
Kirġiżistan (revision 272019)
|
||||
Tajwan (revision 279695)
|
||||
Vorarlberg (revision 257780)
|
||||
Sważiland (revision 281435)
|
||||
Vjenna (revision 276498)
|
||||
Ċina (revision 266233)
|
||||
Slovakkja (revision 266528)
|
||||
Finlandja (revision 282145)
|
||||
New Zealand (revision 281423)
|
||||
Eġittu (revision 283121)
|
||||
Russja (revision 266526)
|
||||
San Marino (revision 279324)
|
||||
Amsterdam (revision 283582)
|
||||
Belġju (revision 276022)
|
||||
Dizzjunarju (revision 280267)
|
||||
Filippini (revision 266237)
|
||||
Indja (revision 281401)
|
||||
Estonja (revision 274160)
|
||||
Kosovo (revision 277587)
|
||||
Lista ta' kodiċi telefoniċi (revision 257699)
|
||||
Kumitat Ewropew tar-Reġjuni (revision 257772)
|
||||
Pajjiżi tad-dinja (revision 272996)
|
||||
Lingwa Franċiża (revision 274763)
|
||||
Tajlandja (revision 279268)
|
||||
Nepal (revision 282662)
|
||||
Repubblika tal-Irlanda (revision 280123)
|
||||
Slovenja (revision 279330)
|
||||
Belt tal-Vatikan (revision 282714)
|
||||
Barbados (revision 281384)
|
||||
Burma (revision 266367)
|
||||
Franza (revision 283381)
|
||||
Emirati Għarab Magħquda (revision 278957)
|
||||
Korea t'Isfel (revision 266423)
|
||||
Armenja (revision 278995)
|
||||
Liechtenstein (revision 279758)
|
||||
Rumanija (revision 266525)
|
||||
Spanja (revision 274001)
|
||||
Ġibiltà (revision 276370)
|
||||
Vanwatu (revision 281454)
|
||||
Ungerija (revision 268808)
|
||||
Lingwa Latina (revision 283488)
|
||||
Ġermanja (revision 279831)
|
||||
Malażja (revision 281411)
|
||||
Polonja (revision 266426)
|
||||
Mawrizju (revision 281415)
|
||||
Norveġja (revision 279820)
|
||||
Soċjaliżmu Demokratiku (revision 258041)
|
||||
Andorra (revision 278993)
|
||||
Taħdit (revision 268012)
|
||||
Lussemburgu (revision 279759)
|
||||
Unjoni Sovjetika (revision 274565)
|
||||
Sri Lanka (revision 281445)
|
||||
Partit Laburista (Malta) (revision 279298)
|
||||
Transnistrija (revision 266548)
|
||||
Baħar Mediterran (revision 273028)
|
||||
Awstralja (revision 281381)
|
||||
Belt kapitali (revision 274120)
|
||||
Pajjiż interkjuż (revision 272618)
|
||||
Albanija (revision 272682)
|
||||
Ġamajka (revision 281389)
|
||||
Lingwa Spanjola (revision 281303)
|
||||
Lista ta' pajjiżi skont il-popolazzjoni (revision 274453)
|
||||
Danimarka (revision 280266)
|
||||
Professjoni (revision 266989)
|
||||
Italja (revision 277251)
|
||||
Indoewropew (revision 270688)
|
||||
Saħara tal-Punent (revision 269703)
|
||||
Letteratura ta' Malta (revision 281220)
|
||||
Iran (revision 283340)
|
||||
Rewwixta tal-Qassisin (revision 283044)
|
||||
Dante Alighieri (revision 260951)
|
||||
Iżvizzera (revision 268804)
|
||||
Nawru (revision 281420)
|
||||
Messiku (revision 283374)
|
||||
Gżejjer Faroe (revision 262423)
|
||||
Ażerbajġan (revision 283345)
|
||||
Uganda (revision 281453)
|
||||
Fiġi (revision 281457)
|
||||
Komunikazzjoni f'Malta (revision 280753)
|
||||
Kenja (revision 281404)
|
||||
Butan (revision 266370)
|
||||
Timor tal-Lvant (revision 266383)
|
||||
Kanada (revision 281405)
|
||||
Imperu Ruman (revision 283043)
|
||||
Ġużè Aquilina (revision 271449)
|
||||
Bożnija-Ħerzegovina (revision 266494)
|
||||
Ukrajna (revision 274996)
|
||||
Stati Uniti tal-Amerika (revision 280264)
|
||||
|
||||
== End of Parsed pages ==
|
||||
|
||||
- Wikipedia parsing ended at: 2022-12-15 00:03:07.916796
|
||||
|
||||
50 characters appeared 482389 times.
|
||||
|
||||
Most Frequent characters:
|
||||
[ 0] Char a: 12.462141549662203 %
|
||||
[ 1] Char i: 11.963995862260541 %
|
||||
[ 2] Char l: 8.083724960560875 %
|
||||
[ 3] Char t: 7.7219837102421485 %
|
||||
[ 4] Char e: 6.614578690641785 %
|
||||
[ 5] Char n: 6.13115141514421 %
|
||||
[ 6] Char r: 5.558169858765436 %
|
||||
[ 7] Char u: 4.270827071098222 %
|
||||
[ 8] Char o: 3.764596622228119 %
|
||||
[ 9] Char m: 3.570976950137752 %
|
||||
[10] Char j: 3.490751240181679 %
|
||||
[11] Char s: 3.4030626734855067 %
|
||||
[12] Char k: 2.734515090518233 %
|
||||
[13] Char d: 2.418794790096789 %
|
||||
[14] Char b: 2.0983065534247256 %
|
||||
[15] Char f: 2.0477249688529384 %
|
||||
[16] Char p: 2.041505921569522 %
|
||||
[17] Char ħ: 1.6422430859741828 %
|
||||
[18] Char g: 1.5356900758516467 %
|
||||
[19] Char w: 1.4484161123077017 %
|
||||
[20] Char z: 1.169388190858415 %
|
||||
[21] Char ż: 1.0070710567612444 %
|
||||
[22] Char h: 0.9239431247395773 %
|
||||
[23] Char ġ: 0.7997694806473613 %
|
||||
[24] Char v: 0.7346767857476021 %
|
||||
[25] Char ċ: 0.6720717097612093 %
|
||||
[26] Char x: 0.6161002842104609 %
|
||||
[27] Char q: 0.5605434618119401 %
|
||||
[28] Char c: 0.2748818899270091 %
|
||||
[29] Char à: 0.10033396283911945 %
|
||||
[30] Char y: 0.06820221854146757 %
|
||||
|
||||
The first 31 characters have an accumulated ratio of 0.9993013936884962.
|
||||
The first 4 characters have an accumulated ratio of 0.4023184608272577.
|
||||
All characters whose order is over 23 have an accumulated ratio of 0.030268103128388086.
|
||||
|
||||
922 sequences found.
|
||||
|
||||
First 517 (typical positive ratio): 0.9950257898046807
|
||||
Next 170 (687-517): 0.003980403371696695
|
||||
Rest: 0.0009938068236226005
|
||||
|
||||
- Processing end: 2022-12-15 00:03:07.989063
|
||||
246
script/BuildLangModelLogs/LangNorwegianModel.log
Normal file
246
script/BuildLangModelLogs/LangNorwegianModel.log
Normal file
@ -0,0 +1,246 @@
|
||||
= Logs of language model for Norwegian (no) =
|
||||
|
||||
- Generated by BuildLangModel.py
|
||||
- Started: 2022-12-14 20:35:35.304097
|
||||
- Maximum depth: 4
|
||||
- Max number of pages: 200
|
||||
|
||||
== Parsed pages ==
|
||||
|
||||
Norsk (revision 23146192)
|
||||
Saft (revision 23143694)
|
||||
Hund (revision 23005187)
|
||||
Norsk hvalfangst (revision 22116809)
|
||||
Genom (revision 20692187)
|
||||
Afghansk mynde (revision 22889802)
|
||||
2014 (revision 22881078)
|
||||
Richard Hammond (revision 22995054)
|
||||
Hedmark (revision 23104871)
|
||||
Dalmål (revision 23037578)
|
||||
Hundesykdommer (revision 22891810)
|
||||
Harald V (revision 23093221)
|
||||
Chihuahua (hund) (revision 23039799)
|
||||
Ringerike (revision 23014741)
|
||||
Rein (revision 23153751)
|
||||
Aggresjon (revision 22817402)
|
||||
Pitbuller (revision 23147194)
|
||||
Skogflåttencefalitt (revision 21355504)
|
||||
Veier i Norge (revision 23131794)
|
||||
G (revision 22001469)
|
||||
Verdensarven i Norge (revision 22706392)
|
||||
Chow-chow (revision 22303220)
|
||||
Hundeveddeløp (revision 21781069)
|
||||
Likvider (revision 16002002)
|
||||
Lindesnes fyr (revision 22558342)
|
||||
Mikroskop (revision 22990820)
|
||||
Kuala Lumpur (revision 18685439)
|
||||
Jernbane i Norge (revision 23171661)
|
||||
Klima- og miljødepartementet (revision 22759198)
|
||||
Fonetikk (revision 17407257)
|
||||
Utstillingshund (revision 21776318)
|
||||
Norges klima (revision 22292013)
|
||||
Implosiver (revision 16001522)
|
||||
Skogflått (revision 22676979)
|
||||
Rundormer (revision 20692220)
|
||||
Veddemål (revision 23085870)
|
||||
Steinalderen i Norge (revision 23157425)
|
||||
1975 (revision 23106834)
|
||||
Gælere (revision 22256357)
|
||||
Miniature bull terrier (revision 22876839)
|
||||
Dalarna (revision 22998224)
|
||||
Hunderase (revision 23087465)
|
||||
Norges fylker (revision 23129287)
|
||||
Terrier (revision 22337755)
|
||||
Ukraina (revision 23098675)
|
||||
Canadian Kennel Club (revision 17312293)
|
||||
Engelsk (revision 22974505)
|
||||
Eurasia (revision 22441158)
|
||||
Livmorinfeksjon (hund) (revision 22374115)
|
||||
Biologisk vektor (revision 18577508)
|
||||
Alfabetisk liste over hunderaser (revision 23118211)
|
||||
1900-tallet (revision 22240557)
|
||||
ÅDT (revision 22583842)
|
||||
CCA impregnering (revision 21777910)
|
||||
Antikken (revision 22943525)
|
||||
Shiba inu (revision 22388321)
|
||||
Hetshund (revision 22890705)
|
||||
Artikulasjonsmåte (revision 15203824)
|
||||
Århundre (revision 21299134)
|
||||
Flått (revision 22681781)
|
||||
Amazon Prime (revision 22812913)
|
||||
1920-årene (revision 20456232)
|
||||
18. mars (revision 22952061)
|
||||
Svalbards geologi (revision 22935346)
|
||||
Manchester (revision 23119484)
|
||||
Første verdenskrig (revision 23140432)
|
||||
Irland (revision 23159279)
|
||||
NKK (revision 23063493)
|
||||
Norsk Myndeklubb (revision 23045651)
|
||||
Kaukasisk mynde (revision 22267265)
|
||||
Villrein (revision 22756825)
|
||||
Lure coursing (revision 21812707)
|
||||
Nj (bokstav) (revision 21771118)
|
||||
¥ (revision 22965075)
|
||||
Bukspyttkjertel (revision 22542308)
|
||||
Hunere (revision 23166843)
|
||||
Sørsamer (revision 22992697)
|
||||
Ivar Mortensson-Egnund (revision 22698099)
|
||||
Bouvetøya (revision 22949651)
|
||||
Programleder (revision 22850937)
|
||||
Lillestrøm (revision 22991400)
|
||||
Terriere (revision 22337755)
|
||||
Basepar (revision 23057051)
|
||||
Ɋ (revision 15223809)
|
||||
Oslo Børs (revision 23151615)
|
||||
Rudolf er rød på nesen (revision 21000110)
|
||||
DNA (revision 23057051)
|
||||
Jonas Gahr Støre (revision 23102305)
|
||||
Anders Fogh Rasmussen (revision 22814443)
|
||||
1990-årene (revision 22530346)
|
||||
Obstruenter (revision 15267134)
|
||||
Huset Oldenburg (revision 23016074)
|
||||
Sinne (revision 22815834)
|
||||
Den Fredrikshaldske kongevei (revision 20821113)
|
||||
Amerikansk pitbullterrier (revision 23085299)
|
||||
Fjellrein (revision 22756825)
|
||||
1902 (revision 20888070)
|
||||
Familiehund (revision 23005187)
|
||||
Ƥ (revision 15223129)
|
||||
Afrikaans (revision 22756323)
|
||||
Andre verdenskrig (revision 23160663)
|
||||
Norges geografi (revision 23133696)
|
||||
Nord-Amerika (revision 23054666)
|
||||
Mygg (revision 22985964)
|
||||
Bulldogger (revision 20902118)
|
||||
Klikkelyder (revision 16001775)
|
||||
Kong Olav Vs minnemedalje (revision 21777896)
|
||||
Liste over Norges høyeste fjell (revision 23050964)
|
||||
Sjodogg (revision 22140307)
|
||||
Tamkatt (revision 23104429)
|
||||
Sveamål (revision 21833006)
|
||||
Idrettsgallaen 2009 (revision 20635966)
|
||||
Mitokondrielt DNA (revision 20740594)
|
||||
Flapp (revision 22064978)
|
||||
Storbritannia (revision 23168796)
|
||||
Hvithval (revision 22943982)
|
||||
Kakao (revision 23034788)
|
||||
Kultur (revision 23117718)
|
||||
1987 (revision 23169663)
|
||||
Tamhund (revision 23005187)
|
||||
Mongolia (revision 23162110)
|
||||
Liste over flyplasser i Norge (revision 23113767)
|
||||
14. september (revision 21835246)
|
||||
Energi i Norge (revision 23103284)
|
||||
Gen (revision 21771198)
|
||||
Aztekerne (revision 23145419)
|
||||
VM i fotball 2014 (revision 23069784)
|
||||
Karachi (revision 23125037)
|
||||
Halvvokal (revision 22225000)
|
||||
Kronprins (revision 22279985)
|
||||
Eidsvoll (revision 23104580)
|
||||
2012 (revision 23066457)
|
||||
Bivirkning (revision 20255322)
|
||||
Norsk reservistforbund (revision 22942543)
|
||||
The Movie Database (revision 23017607)
|
||||
Liste over fjorder i Norge (revision 22782077)
|
||||
Russland (revision 23170591)
|
||||
Utmark (revision 22556790)
|
||||
Tilde (revision 20755981)
|
||||
Østerdalsmål (revision 21922703)
|
||||
Ejektiver (revision 20859577)
|
||||
Isbre (revision 22963670)
|
||||
Chortaj (revision 22706807)
|
||||
Brukshundsport (revision 19057868)
|
||||
Smerte (revision 22213330)
|
||||
Horten (revision 23002692)
|
||||
Norges Bank (revision 23082042)
|
||||
Kennelhoste (revision 15214233)
|
||||
Mustang (hest) (revision 22360602)
|
||||
Psykologi (revision 23160138)
|
||||
Amerikansk staffordshireterrier (revision 22994828)
|
||||
Ragnhild, fru Lorentzen (revision 22658738)
|
||||
Norsk film (revision 22983409)
|
||||
Sindh (mynde) (revision 22891367)
|
||||
Patellaluksasjon (revision 23131481)
|
||||
Gameshow (revision 20178756)
|
||||
FCI (revision 22172054)
|
||||
Limburgsk (revision 18402103)
|
||||
Guangdong (revision 19816178)
|
||||
Nepal (revision 23163450)
|
||||
Stortingets transport- og kommunikasjonskomité (revision 23031702)
|
||||
Nynorsk (revision 23017339)
|
||||
Dyr (revision 23101991)
|
||||
Vegdirektoratet (revision 22996967)
|
||||
Tysk (revision 23093771)
|
||||
Liste over stortingsrepresentanter 2017–2021 (revision 22503649)
|
||||
Høy- og senmiddelalder i Norge (revision 21766452)
|
||||
2006 (revision 23157594)
|
||||
2004 (revision 22926786)
|
||||
Sonanter (revision 15157359)
|
||||
Smitte (revision 20772346)
|
||||
Vakthund (revision 19794728)
|
||||
Liste over hunderaser (revision 22927266)
|
||||
Liste over norske geografiske ytterpunkter (revision 22232516)
|
||||
Hjernehinnebetennelse (revision 22147847)
|
||||
Wimbledon (London) (revision 20153934)
|
||||
Abbor (revision 22392180)
|
||||
Zoologi (revision 21442012)
|
||||
Kamphunder (revision 22266192)
|
||||
1600-tallet (revision 20456202)
|
||||
Grunnbeløpet i folketrygden (revision 22658432)
|
||||
1000 (revision 20456192)
|
||||
Luftforsvaret (revision 23000239)
|
||||
Oppland (revision 23086810)
|
||||
Japan (revision 23159630)
|
||||
James May (revision 22051450)
|
||||
Unionen mellom Sverige og Norge (revision 22922743)
|
||||
|
||||
== End of Parsed pages ==
|
||||
|
||||
- Wikipedia parsing ended at: 2022-12-14 20:38:56.878849
|
||||
|
||||
64 characters appeared 1913385 times.
|
||||
|
||||
Most Frequent characters:
|
||||
[ 0] Char e: 14.97764433190393 %
|
||||
[ 1] Char r: 8.902181212876656 %
|
||||
[ 2] Char n: 8.316726638914803 %
|
||||
[ 3] Char t: 7.665838291823131 %
|
||||
[ 4] Char s: 6.791314868675149 %
|
||||
[ 5] Char a: 6.277095304917725 %
|
||||
[ 6] Char i: 6.1219252790212115 %
|
||||
[ 7] Char o: 5.313253736179599 %
|
||||
[ 8] Char l: 5.160644616739443 %
|
||||
[ 9] Char d: 4.388034817875127 %
|
||||
[10] Char k: 3.977453570504629 %
|
||||
[11] Char g: 3.5606007154859056 %
|
||||
[12] Char m: 3.224338018746881 %
|
||||
[13] Char v: 2.4138372570078683 %
|
||||
[14] Char f: 2.0508157009697476 %
|
||||
[15] Char u: 1.9050530865455724 %
|
||||
[16] Char p: 1.7718336874178484 %
|
||||
[17] Char b: 1.5361780300357741 %
|
||||
[18] Char h: 1.314372172876865 %
|
||||
[19] Char å: 1.0595881121677027 %
|
||||
[20] Char j: 0.8951152015929883 %
|
||||
[21] Char y: 0.8807950307962067 %
|
||||
[22] Char ø: 0.7883933447790172 %
|
||||
[23] Char c: 0.29617667118745056 %
|
||||
[24] Char æ: 0.1901865019324391 %
|
||||
[25] Char w: 0.08696629272206063 %
|
||||
[26] Char z: 0.04562594564084071 %
|
||||
[27] Char x: 0.024981903798765016 %
|
||||
[28] Char é: 0.02148025619517243 %
|
||||
[29] Char q: 0.008414406928035916 %
|
||||
|
||||
The first 30 characters have an accumulated ratio of 0.9996686500625853.
|
||||
The first 5 characters have an accumulated ratio of 0.46653705344193663.
|
||||
All characters whose order is over 19 have an accumulated ratio of 0.03238135555572977.
|
||||
|
||||
1072 sequences found.
|
||||
|
||||
First 458 (typical positive ratio): 0.9950153891390712
|
||||
Next 162 (620-458): 0.003987433002112573
|
||||
Rest: 0.000997177858816256
|
||||
|
||||
- Processing end: 2022-12-14 20:38:56.956190
|
||||
250
script/BuildLangModelLogs/LangPolishModel.log
Normal file
250
script/BuildLangModelLogs/LangPolishModel.log
Normal file
@ -0,0 +1,250 @@
|
||||
= Logs of language model for Polish (pl) =
|
||||
|
||||
- Generated by BuildLangModel.py
|
||||
- Started: 2022-12-15 00:13:38.394852
|
||||
- Maximum depth: 4
|
||||
- Max number of pages: 200
|
||||
|
||||
== Parsed pages ==
|
||||
|
||||
Krasnyj Krym (revision 66824963)
|
||||
Operacja desantowa kerczeńsko-teodozyjska (revision 68227227)
|
||||
Stal Kruppa (revision 44611245)
|
||||
Kronsztad (revision 68581947)
|
||||
Długość całkowita (statek) (revision 64884292)
|
||||
Avro 504 (revision 66779136)
|
||||
Rosyjska Federacyjna Socjalistyczna Republika Radziecka (revision 68953662)
|
||||
Hulk (okręt) (revision 61976707)
|
||||
AG Vulcan Stettin (revision 62678993)
|
||||
II wojna światowa (revision 68838789)
|
||||
45 mm armata morska 21-K (revision 65496261)
|
||||
Wyporność (revision 67657939)
|
||||
Armia Imperium Rosyjskiego (revision 68692314)
|
||||
Okręt podwodny (revision 68767081)
|
||||
Royal Navy (revision 68205433)
|
||||
Gęstość (revision 67599137)
|
||||
Kirgistan (revision 68953659)
|
||||
Królestwo Nepalu (revision 69019817)
|
||||
Forsowanie Odry (1945) (revision 67498395)
|
||||
1913 (revision 68859860)
|
||||
Wielkie Księstwo Moskiewskie (revision 68845644)
|
||||
Mühlhausen/Thüringen (revision 64700635)
|
||||
Okręt desantowy (revision 68857265)
|
||||
Kazachstan (revision 68913726)
|
||||
Awizo (klasa okrętów) (revision 67989058)
|
||||
Encyklopedia Britannica (revision 68175162)
|
||||
Konflikt izraelsko-arabski (revision 68493174)
|
||||
Znak wolnej burty (revision 63934239)
|
||||
Australia (revision 69013390)
|
||||
Sobór św. Mikołaja w Kronsztadzie (revision 68397490)
|
||||
Orlando Figes (revision 68849447)
|
||||
Thomas Spencer Vaughan Phillips (revision 63723659)
|
||||
Encyklopedia PWN (internetowa) (revision 68423009)
|
||||
Okręt (revision 68076001)
|
||||
Konstrukcyjna linia wodna (revision 59497856)
|
||||
Godło Rosyjskiej FSRR (revision 64813172)
|
||||
T-28 (revision 68673624)
|
||||
Airco DH.1 (revision 68388298)
|
||||
Samolot bombowy (revision 68816315)
|
||||
Powrotnik (revision 68623552)
|
||||
Fairey F.2 (revision 67777553)
|
||||
Niemcy (revision 68880193)
|
||||
Operacja biełgorodzko-bogoduchowska (revision 67580369)
|
||||
Tallinn (revision 68863189)
|
||||
Ciężar właściwy (revision 64801779)
|
||||
Japonia (revision 69024870)
|
||||
1922 (revision 69018633)
|
||||
Erich von Manstein (revision 68197757)
|
||||
Petersburski Uniwersytet Państwowy (revision 67269929)
|
||||
Związek Socjalistycznych Republik Radzieckich (revision 69029154)
|
||||
Virtual International Authority File (revision 61134022)
|
||||
Bitwa stalingradzka (revision 67469898)
|
||||
Rosja (revision 68550011)
|
||||
Sopwith Snipe (revision 65656049)
|
||||
Smoleńsk (revision 68499198)
|
||||
Niemieckie okręty podwodne (revision 68937051)
|
||||
Bitwa w rejonie Dubno – Łuck – Brody (revision 68737435)
|
||||
Töröbaj Kułatow (revision 60409630)
|
||||
Siły Zbrojne Południa Rosji (revision 67581714)
|
||||
Sułtan Ibraimow (revision 63215605)
|
||||
Operacja desantowa kerczeńsko-eltigeńska (revision 68637949)
|
||||
Marynarka Wojenna Związku Socjalistycznych Republik Radzieckich (revision 67702441)
|
||||
MS Goya (revision 67941500)
|
||||
SMS A 13 (revision 66790436)
|
||||
Erich Hoepner (revision 66017135)
|
||||
Władcy Prus (revision 68323919)
|
||||
Lądowanie w zatoce Lingayen (revision 68419739)
|
||||
Republika Nowogrodzka (revision 67326541)
|
||||
Tallin (1939) (revision 56881640)
|
||||
Artyleria przeciwlotnicza (revision 67839959)
|
||||
Pojemność rejestrowa (revision 67716637)
|
||||
Królestwo Islandii (revision 65512641)
|
||||
Ententa (revision 68167669)
|
||||
Szybkostrzelność (revision 57824961)
|
||||
Siła wyporu (revision 64381886)
|
||||
Celownik (element broni) (revision 60852867)
|
||||
Operacja berlińska (revision 67819323)
|
||||
302 Dywizja Strzelecka (ZSRR) (revision 67827339)
|
||||
Stocznia (revision 67598238)
|
||||
Wydawnictwo Dolnośląskie (revision 65401619)
|
||||
Artyleria okrętowa (revision 68924054)
|
||||
Arsenał (revision 65305881)
|
||||
Bitwa białostocko-mińska (revision 67141687)
|
||||
Edward Śmigły-Rydz (revision 69027011)
|
||||
Sopwith Camel (revision 66404857)
|
||||
Besarabia (revision 67455676)
|
||||
Krasnyj Kawkaz (1916) (revision 68695905)
|
||||
Bitwa o Łódź (1945) (revision 68842103)
|
||||
Borsig (revision 66142577)
|
||||
Zamek (broń) (revision 67003123)
|
||||
Moc (revision 68381359)
|
||||
Store norske leksikon (revision 66724573)
|
||||
Brazylia (revision 68986158)
|
||||
Kuter torpedowy (revision 67253500)
|
||||
Samolot wielozadaniowy (revision 56523743)
|
||||
SMS Weißenburg (revision 68849441)
|
||||
Ofensywna operacja tichwińska (revision 67142132)
|
||||
Kontrola autorytatywna (revision 64027474)
|
||||
Charków (revision 68742074)
|
||||
Lidzbark Warmiński (revision 68815317)
|
||||
Grand Fleet (revision 62710613)
|
||||
Statek wodny (revision 68480347)
|
||||
BAE Hawk (revision 68490080)
|
||||
Czerwona Ukraina (revision 68695903)
|
||||
Molo (revision 68232968)
|
||||
De Havilland Venom (revision 67653896)
|
||||
Bukowina (kraina historyczna) (revision 67524307)
|
||||
System prezydencki (revision 66658106)
|
||||
Karl-Adolf Hollidt (revision 68040819)
|
||||
Etanol (revision 68334503)
|
||||
ZiS-6 (revision 68336496)
|
||||
Proch (revision 64704332)
|
||||
Język angielski (revision 68974067)
|
||||
Pułk (revision 68342303)
|
||||
Pływalność (revision 62268316)
|
||||
Medal „Za obronę Stalingradu” (revision 68677329)
|
||||
Antymon (revision 67894068)
|
||||
Abdy Sujerkułow (revision 60192153)
|
||||
Pszczyna (revision 68842474)
|
||||
Prawo Archimedesa (revision 68593352)
|
||||
Wolna burta (revision 67693175)
|
||||
Karabin automatyczny (revision 69035912)
|
||||
Pacyficzny pierścień ognia (revision 67805171)
|
||||
Operacja Cartwheel (revision 68194441)
|
||||
Lista państw Azji (revision 68953116)
|
||||
Eskortowiec (revision 68452765)
|
||||
Kołobrzeg (revision 68594253)
|
||||
Kilogram na metr sześcienny (revision 62047241)
|
||||
Encykłopedija suczasnoji Ukrajiny (revision 68377152)
|
||||
6 Armia (ZSRR) (revision 68916634)
|
||||
Język francuski (revision 68876046)
|
||||
Gmina Harku (revision 67377654)
|
||||
Mitar Martinović (revision 67607990)
|
||||
Stany Zjednoczone (revision 68987968)
|
||||
Powiat Rybnik (revision 68950256)
|
||||
Demokracja (revision 68826388)
|
||||
Objętość (revision 65687559)
|
||||
Mikołaj Romanow (1856–1929) (revision 68298239)
|
||||
Wydawnictwo Naukowe PWN (revision 68880877)
|
||||
Liban (revision 68635073)
|
||||
31 grudnia (revision 68619846)
|
||||
Richard Pipes (revision 68676471)
|
||||
Republika Rosyjska (revision 68522243)
|
||||
Tona (revision 64478254)
|
||||
Encyklopedia internetowa (revision 67889911)
|
||||
Karabin powtarzalny (revision 69017633)
|
||||
Żmija zygzakowata (revision 68336630)
|
||||
Edvard Beneš (revision 68402916)
|
||||
Lata 1950-1959 (revision 67855414)
|
||||
Okręt patrolowy (revision 67764276)
|
||||
Pas Taiheiyō (revision 67625354)
|
||||
Torpedowiec (revision 67691361)
|
||||
Conrad Albrecht (revision 68384954)
|
||||
Przyspieszenie ziemskie (revision 68942776)
|
||||
Gyanendra Bir Bikram Shah Dev (revision 66267434)
|
||||
Organizacja Narodów Zjednoczonych (revision 68946932)
|
||||
1939 (revision 68948356)
|
||||
Batu-chan (revision 68902695)
|
||||
Godło Kirgiskiej SRR (revision 67909362)
|
||||
Atak atomowy na Hiroszimę i Nagasaki (revision 68959464)
|
||||
III Rzesza (revision 68950644)
|
||||
Armia Ochotnicza (Rosja) (revision 68679547)
|
||||
Samolot (revision 66993131)
|
||||
Unia Europejska (revision 68812895)
|
||||
Sonar (revision 68441472)
|
||||
Komodor (revision 67241059)
|
||||
Online (revision 67120633)
|
||||
Kocioł kurlandzki (revision 66573412)
|
||||
Belém (revision 67878012)
|
||||
Robert von Greim (revision 67984145)
|
||||
Granica (matematyka) (revision 67167465)
|
||||
Kanonierka (revision 68462848)
|
||||
Bristol Scout (revision 50081031)
|
||||
Operacja sandomiersko-śląska (revision 66833578)
|
||||
Krążownik lekki (revision 68464189)
|
||||
Chiny (revision 68764503)
|
||||
1543 (revision 66983943)
|
||||
Monarchia (revision 68774253)
|
||||
Online Computer Library Center (revision 68370224)
|
||||
Tursunbek Czyngyszew (revision 68582395)
|
||||
Biblioteka Narodowa Korei (revision 67437611)
|
||||
Order Łaźni (revision 67884695)
|
||||
Zimowe Igrzyska Olimpijskie 2010 (revision 69033451)
|
||||
|
||||
== End of Parsed pages ==
|
||||
|
||||
- Wikipedia parsing ended at: 2022-12-15 00:39:03.247215
|
||||
|
||||
81 characters appeared 2120558 times.
|
||||
|
||||
Most Frequent characters:
|
||||
[ 0] Char a: 8.864789362045274 %
|
||||
[ 1] Char i: 8.27046466071666 %
|
||||
[ 2] Char o: 8.072403584339593 %
|
||||
[ 3] Char e: 7.2753963815184495 %
|
||||
[ 4] Char n: 5.956828344237696 %
|
||||
[ 5] Char r: 5.377641168032188 %
|
||||
[ 6] Char z: 4.9300702928191535 %
|
||||
[ 7] Char w: 4.771527116919226 %
|
||||
[ 8] Char s: 4.459911023419308 %
|
||||
[ 9] Char c: 3.991873836980644 %
|
||||
[10] Char t: 3.9560813710353595 %
|
||||
[11] Char k: 3.8696890158156485 %
|
||||
[12] Char y: 3.733498447106846 %
|
||||
[13] Char d: 3.201232883043048 %
|
||||
[14] Char p: 2.9419143451865026 %
|
||||
[15] Char m: 2.612661384409198 %
|
||||
[16] Char u: 2.413232743457147 %
|
||||
[17] Char l: 2.370838241632627 %
|
||||
[18] Char j: 2.2605370850502555 %
|
||||
[19] Char ł: 1.7070035339754912 %
|
||||
[20] Char g: 1.4294350826527735 %
|
||||
[21] Char b: 1.3428069404373755 %
|
||||
[22] Char h: 1.307344576286053 %
|
||||
[23] Char ę: 0.9403656961988307 %
|
||||
[24] Char ą: 0.8652439593729575 %
|
||||
[25] Char ó: 0.8151156440899046 %
|
||||
[26] Char ż: 0.5589094945764275 %
|
||||
[27] Char ś: 0.48477806313243965 %
|
||||
[28] Char f: 0.42903801735203656 %
|
||||
[29] Char ń: 0.2847835333907396 %
|
||||
[30] Char ć: 0.1965048822055327 %
|
||||
[31] Char v: 0.1065285646513795 %
|
||||
[32] Char x: 0.06677487717855395 %
|
||||
[33] Char ź: 0.06503005341047027 %
|
||||
[34] Char ō: 0.012496710771410166 %
|
||||
[35] Char ü: 0.007592341261120894 %
|
||||
[36] Char é: 0.007545183861983497 %
|
||||
[37] Char q: 0.006932137673197337 %
|
||||
|
||||
The first 38 characters have an accumulated ratio of 0.999648205802435.
|
||||
The first 6 characters have an accumulated ratio of 0.43817523500889866.
|
||||
All characters whose order is over 24 have an accumulated ratio of 0.030420295035551964.
|
||||
|
||||
1353 sequences found.
|
||||
|
||||
First 583 (typical positive ratio): 0.9950348611960773
|
||||
Next 199 (782-583): 0.003966625938834456
|
||||
Rest: 0.0009985128650882302
|
||||
|
||||
- Processing end: 2022-12-15 00:39:03.703819
|
||||
256
script/BuildLangModelLogs/LangPortugueseModel.log
Normal file
256
script/BuildLangModelLogs/LangPortugueseModel.log
Normal file
@ -0,0 +1,256 @@
|
||||
= Logs of language model for Portuguese (pt) =
|
||||
|
||||
- Generated by BuildLangModel.py
|
||||
- Started: 2022-12-15 00:13:40.621625
|
||||
- Maximum depth: 4
|
||||
- Max number of pages: 200
|
||||
|
||||
== Parsed pages ==
|
||||
|
||||
Papagaio-das-mascarenhas (revision 61083234)
|
||||
François-Nicolas Martinet (revision 43679514)
|
||||
Tanygnathus (revision 63727477)
|
||||
Alfred Newton (revision 63772066)
|
||||
Aves (revision 64642129)
|
||||
Julian Hume (revision 41876605)
|
||||
Ilha da Reunião (revision 64417746)
|
||||
Tirosina (revision 64330501)
|
||||
Pedro Mascarenhas (c. 1484-1555) (revision 64281128)
|
||||
Herpetologista (revision 60800107)
|
||||
Praslin (revision 60991639)
|
||||
August von Pelzeln (revision 62048504)
|
||||
INaturalist (revision 62752196)
|
||||
Peixe (revision 64431170)
|
||||
Oswald Heer (revision 64579144)
|
||||
Biólogo (revision 61723910)
|
||||
Coral (revision 62383136)
|
||||
Royal Society (revision 63227627)
|
||||
Johann Natterer (revision 63305664)
|
||||
Família (biologia) (revision 61575111)
|
||||
UNESCO (revision 64651554)
|
||||
Paridade do poder de compra (revision 64194230)
|
||||
Lorena (França) (revision 57151319)
|
||||
Área (revision 63988916)
|
||||
Ecologia (revision 64022144)
|
||||
Masiakasaurus (revision 64018705)
|
||||
1984 (revision 64860161)
|
||||
Animalia (revision 64303459)
|
||||
Biblioteca Nacional da Austrália (revision 63908354)
|
||||
Malaca Portuguesa (revision 64517772)
|
||||
México (revision 64868116)
|
||||
PubMed Identifier (revision 64178664)
|
||||
Owen Willans Richardson (revision 58168602)
|
||||
William Burnside (revision 62739863)
|
||||
Endémico (revision 64450772)
|
||||
Amendoim (revision 64423017)
|
||||
Cisteína (revision 64443908)
|
||||
Réptil (revision 64240956)
|
||||
Omnívoro (revision 64303184)
|
||||
Psittaciformes (revision 63932960)
|
||||
Joel Serrão (revision 62566046)
|
||||
Áustria (revision 64777663)
|
||||
Seicheles (revision 64635903)
|
||||
Chordata (revision 64103327)
|
||||
Anfíbio (revision 64657407)
|
||||
Johann Georg Wagler (revision 61847261)
|
||||
Feniletilamina (revision 64766772)
|
||||
Aminoácido essencial (revision 62163188)
|
||||
Ictiologia (revision 64184350)
|
||||
Georg von Frauenfeld (revision 62413353)
|
||||
Sistema de acasalamento (revision 64465607)
|
||||
Oxford University Press (revision 63009975)
|
||||
Coordenadas geográficas (revision 64874098)
|
||||
Digital object identifier (revision 63209667)
|
||||
John Desmond Bernal (revision 60419838)
|
||||
John Edward Marr (revision 62745345)
|
||||
Encefalina (revision 64330411)
|
||||
Conquiliologia (revision 56999872)
|
||||
Quilómetro quadrado (revision 64134927)
|
||||
História da Itália (revision 63544997)
|
||||
Identificação automatizada de espécies (revision 60520809)
|
||||
Gil Eanes (revision 64787644)
|
||||
Registro CAS (revision 62829292)
|
||||
Ronald Ross (revision 63575195)
|
||||
Biologia regenerativa (revision 56549505)
|
||||
Santo Eustáquio (Países Baixos) (revision 63516356)
|
||||
Cabo da Boa Esperança (revision 64850246)
|
||||
Edward Mellanby (revision 59542666)
|
||||
Geografia da França (revision 63700063)
|
||||
Condorraptor (revision 64060396)
|
||||
Fudge (revision 64331291)
|
||||
Rapator (revision 64107459)
|
||||
Viena (revision 64653743)
|
||||
1973 (revision 64252513)
|
||||
Classe (biologia) (revision 63495321)
|
||||
História natural (revision 60797583)
|
||||
Francisco de Mascarenhas (revision 64533486)
|
||||
Henry John Carter (revision 64088767)
|
||||
Garcia de Noronha (revision 61943288)
|
||||
Essuatíni (revision 64541626)
|
||||
Etologia (revision 63703415)
|
||||
1825 (revision 64231448)
|
||||
Pitohui (revision 55136936)
|
||||
Doador de electrões (revision 49471221)
|
||||
Francisco Xavier Soares da Veiga (revision 42160927)
|
||||
Tanygnathus megalorynchos (revision 63460044)
|
||||
Engenharia biológica (revision 64476460)
|
||||
Biologia forense (revision 59861252)
|
||||
Califórnia (revision 64085181)
|
||||
Cupuaçu (revision 64791967)
|
||||
Classificação científica (revision 63914619)
|
||||
Ilha Europa (revision 58458237)
|
||||
Pedro Álvares Cabral (revision 64766295)
|
||||
1882 (revision 60523806)
|
||||
Arquipélago (revision 64873918)
|
||||
Tristão Teixeira (revision 63759821)
|
||||
Ornitologia (revision 63950590)
|
||||
Maiote (revision 63509604)
|
||||
Manuel Duarte Leitão (revision 62776308)
|
||||
Biodiversidade (revision 64635148)
|
||||
Dynamoterror (revision 64149681)
|
||||
Columbiformes (revision 61584181)
|
||||
Los Angeles (revision 64907059)
|
||||
Ilha de Linosa (revision 55210386)
|
||||
Inteligência artificial (revision 64867398)
|
||||
Megaraptora (revision 64096312)
|
||||
Árabes (revision 64244377)
|
||||
Gráfico semi-log (revision 53359355)
|
||||
Densidade populacional (revision 64809653)
|
||||
Garcia de Sá (revision 58468727)
|
||||
Ferdinand von Hochstetter (revision 63490806)
|
||||
Römpp Lexikon Chemie (revision 58796446)
|
||||
Chocolate quente (revision 64330451)
|
||||
Histologia (revision 61422516)
|
||||
Henry Dale (revision 58667524)
|
||||
Estêvão da Gama (c. 1470) (revision 64693733)
|
||||
Espécie (revision 64553712)
|
||||
Reino (biologia) (revision 62163157)
|
||||
África Austral (revision 61960381)
|
||||
Metro (revision 64654584)
|
||||
Sudão (revision 64456425)
|
||||
Fenol (revision 64404823)
|
||||
Lista de disciplinas da biologia (revision 61981999)
|
||||
Ornitólogo (revision 63950590)
|
||||
Porfiriato (revision 64906132)
|
||||
853 a.C. (revision 63744132)
|
||||
Pedro de Noronha (revision 64269853)
|
||||
International Standard Name Identifier (revision 64790504)
|
||||
Padrão-ouro (revision 64448730)
|
||||
Corpo (anatomia) (revision 64637457)
|
||||
Produtividade (ecologia) (revision 63242479)
|
||||
Bioinformática (revision 63353600)
|
||||
Sterculioideae (revision 59214802)
|
||||
Áries (revision 64192868)
|
||||
Filipe II de Espanha (revision 64462191)
|
||||
Biologia (revision 64766112)
|
||||
Bioestatística (revision 64552825)
|
||||
Hospital (revision 63940681)
|
||||
Cecil Edgar Tilley (revision 62726767)
|
||||
Ascendência (revision 58302798)
|
||||
Ostafrikasaurus (revision 64071145)
|
||||
Carl Edward Hellmayr (revision 62499688)
|
||||
África do Sul (revision 64803180)
|
||||
Lamberto Dini (revision 61581701)
|
||||
Hipótese Gaia (revision 63036733)
|
||||
Alberto do Canto (revision 64484219)
|
||||
Real (moeda portuguesa) (revision 64536085)
|
||||
Biomedicina (revision 64851943)
|
||||
Evolução (revision 64809463)
|
||||
Magnoliophyta (revision 64552676)
|
||||
Protostomia (revision 64698835)
|
||||
John Joly (revision 62745300)
|
||||
Base Virtual Internacional de Autoridade (revision 61190425)
|
||||
Joseph Barcroft (revision 53561143)
|
||||
Diogo Cão (revision 64617588)
|
||||
Hadeano (revision 64828835)
|
||||
Gabriel Soares de Sousa (revision 64695192)
|
||||
Partido socialista francês (revision 64269594)
|
||||
António Pais de Sande (revision 52559072)
|
||||
Cranganor (revision 61413974)
|
||||
Bovinos (revision 63721509)
|
||||
1880 (revision 58173615)
|
||||
Celêntero (revision 58975856)
|
||||
Língua grega antiga (revision 64775316)
|
||||
Herpetologia (revision 60800107)
|
||||
Luís Mascarenhas, 2.º conde de Alva (revision 64555516)
|
||||
Custo marginal (revision 62175678)
|
||||
Crocodilo-de-água-salgada (revision 64007088)
|
||||
Geólogo (revision 64608075)
|
||||
Washington, D.C. (revision 64536061)
|
||||
Buenos Aires (revision 64811726)
|
||||
Lisboa (revision 64898256)
|
||||
Chocolate (revision 64868103)
|
||||
Eukaryota (revision 64256026)
|
||||
Eixo terrestre (revision 64647487)
|
||||
Namíbia (revision 64658419)
|
||||
Lagos (Algarve) (revision 64828967)
|
||||
Igreja Copta (revision 64842664)
|
||||
Alexis Carrel (revision 63190094)
|
||||
Temnospondyli (revision 64390652)
|
||||
Gestão hospitalar (revision 63982120)
|
||||
Joaquim Augusto Mouzinho de Albuquerque (revision 64891429)
|
||||
Pantyrannosauria (revision 64848070)
|
||||
Aspirina (revision 64769177)
|
||||
Cracklers de chocolate (revision 64330464)
|
||||
Ilhas Crozet (revision 63024289)
|
||||
Sparidae (revision 59205456)
|
||||
Suliformes (revision 61403162)
|
||||
Miguel Corte Real (revision 64244782)
|
||||
|
||||
== End of Parsed pages ==
|
||||
|
||||
- Wikipedia parsing ended at: 2022-12-15 00:16:37.497792
|
||||
|
||||
52 characters appeared 1621926 times.
|
||||
|
||||
Most Frequent characters:
|
||||
[ 0] Char a: 11.982174279221123 %
|
||||
[ 1] Char e: 11.377091186650933 %
|
||||
[ 2] Char o: 10.194793104001047 %
|
||||
[ 3] Char s: 8.025890207074799 %
|
||||
[ 4] Char i: 7.1634587521255595 %
|
||||
[ 5] Char r: 6.492281398781449 %
|
||||
[ 6] Char d: 5.492112463823873 %
|
||||
[ 7] Char n: 5.366706002616642 %
|
||||
[ 8] Char t: 4.890543711611997 %
|
||||
[ 9] Char m: 4.4280071963825725 %
|
||||
[10] Char c: 4.01473310126356 %
|
||||
[11] Char u: 3.616749469457916 %
|
||||
[12] Char l: 3.2010091705786823 %
|
||||
[13] Char p: 2.7590038016530967 %
|
||||
[14] Char g: 1.3863764438081638 %
|
||||
[15] Char v: 1.197341925587234 %
|
||||
[16] Char f: 1.1109014837914923 %
|
||||
[17] Char b: 1.0721820847560246 %
|
||||
[18] Char h: 0.8071884907202919 %
|
||||
[19] Char ã: 0.7111298542596888 %
|
||||
[20] Char ç: 0.6203735558835607 %
|
||||
[21] Char q: 0.6127899793208814 %
|
||||
[22] Char é: 0.6002739952377606 %
|
||||
[23] Char í: 0.41370568077705144 %
|
||||
[24] Char á: 0.40106638650591947 %
|
||||
[25] Char x: 0.345946732464983 %
|
||||
[26] Char z: 0.3207914541107301 %
|
||||
[27] Char ó: 0.27467344379459974 %
|
||||
[28] Char j: 0.20426332644029382 %
|
||||
[29] Char ê: 0.182190802786317 %
|
||||
[30] Char õ: 0.15703552443206412 %
|
||||
[31] Char y: 0.1389705818884462 %
|
||||
[32] Char ú: 0.09833987493880732 %
|
||||
[33] Char k: 0.0737394924306041 %
|
||||
[34] Char w: 0.07330790677256546 %
|
||||
[35] Char â: 0.07207480489245502 %
|
||||
[36] Char à: 0.0646761936117924 %
|
||||
[37] Char ô: 0.04346684127389289 %
|
||||
|
||||
The first 38 characters have an accumulated ratio of 0.9998736070572886.
|
||||
The first 4 characters have an accumulated ratio of 0.415799487769479.
|
||||
All characters whose order is over 21 have an accumulated ratio of 0.03464523042358283.
|
||||
|
||||
1068 sequences found.
|
||||
|
||||
First 514 (typical positive ratio): 0.9950108744191293
|
||||
Next 183 (697-514): 0.003990843236141739
|
||||
Rest: 0.0009982823447289846
|
||||
|
||||
- Processing end: 2022-12-15 00:16:37.626796
|
||||
227
script/BuildLangModelLogs/LangRomanianModel.log
Normal file
227
script/BuildLangModelLogs/LangRomanianModel.log
Normal file
@ -0,0 +1,227 @@
|
||||
= Logs of language model for Romanian (ro) =
|
||||
|
||||
- Generated by BuildLangModel.py
|
||||
- Started: 2022-12-15 00:01:17.765077
|
||||
- Maximum depth: 4
|
||||
- Max number of pages: 200
|
||||
|
||||
== Parsed pages ==
|
||||
|
||||
The Loving Kind (revision 15340411)
|
||||
Limba engleză (revision 15174203)
|
||||
Whole Lotta History (revision 15221846)
|
||||
The Promise (revision 15302845)
|
||||
Chemistry (revision 13003795)
|
||||
Untouchable (revision 12020867)
|
||||
31 ianuarie (revision 14777533)
|
||||
Neil Tennant (revision 13355922)
|
||||
Dance (revision 14700085)
|
||||
The Guardian (revision 15212051)
|
||||
Billboard (revision 13092896)
|
||||
Sound of the Underground (cântec) (revision 15206321)
|
||||
Compozitor (revision 15313365)
|
||||
The Show (revision 10112441)
|
||||
Compact Disc (revision 13258410)
|
||||
Gen muzical (revision 15348917)
|
||||
Disc single (revision 13271042)
|
||||
29 noiembrie (revision 15270237)
|
||||
Zimbabwe (revision 15223871)
|
||||
Republica Irlanda (revision 15335833)
|
||||
Limba pali (revision 14710607)
|
||||
1954 (revision 15272524)
|
||||
Call the Shots (revision 15311533)
|
||||
Limbi indo-iraniene (revision 13016907)
|
||||
Casă de discuri (revision 15244458)
|
||||
Mary Higgins Clark (revision 14158157)
|
||||
See the Day (revision 10112431)
|
||||
Mai (revision 15170552)
|
||||
Normanzi (revision 15181050)
|
||||
Listă de limbi (revision 15276205)
|
||||
5 decembrie (revision 15333253)
|
||||
The Sound of Girls Aloud (revision 10112480)
|
||||
22 mai (revision 14998993)
|
||||
2009 (revision 15348935)
|
||||
Biblioteca Nacional de España (revision 15237290)
|
||||
Can't Speak French (revision 15243027)
|
||||
Bibliothèque nationale de France (revision 15237314)
|
||||
MSN Search (revision 15237622)
|
||||
5 septembrie (revision 15347684)
|
||||
27 aprilie (revision 14912864)
|
||||
Limba franceză (revision 15326202)
|
||||
Uniunea Europeană (revision 15216020)
|
||||
2005 (revision 15348977)
|
||||
Irlanda (revision 15335833)
|
||||
Statele Unite ale Americii (revision 15339104)
|
||||
Consoană oclusivă (revision 13880727)
|
||||
Contratenor (revision 14250562)
|
||||
3 septembrie (revision 15102675)
|
||||
Mixed Up (revision 10112443)
|
||||
Sri Lanka (revision 15339014)
|
||||
Anglia (revision 15109546)
|
||||
Girls Aloud (revision 15319932)
|
||||
Pet Shop Boys (revision 13165657)
|
||||
Regatul Unit al Marii Britanii și al Irlandei de Nord (revision 15335741)
|
||||
Noua Zeelandă (revision 15181159)
|
||||
1921 (revision 15196999)
|
||||
Parlophone (revision 15295705)
|
||||
1834 (revision 15086768)
|
||||
Something Kinda Ooooh (revision 15206082)
|
||||
1987 (revision 15272755)
|
||||
No Good Advice (revision 10112436)
|
||||
Limba maghiară (revision 15329180)
|
||||
1935 (revision 14962293)
|
||||
Biology (revision 10112430)
|
||||
Muzică pop (revision 15177633)
|
||||
Tangled Up (revision 13010794)
|
||||
26 aprilie (revision 14916666)
|
||||
British Broadcasting Corporation (revision 14882345)
|
||||
Girls A Live (revision 10112444)
|
||||
13 aprilie (revision 15215645)
|
||||
Wake Me Up (revision 10112439)
|
||||
Sexy! No No No... (revision 12017812)
|
||||
I Think We're Alone Now (revision 15152417)
|
||||
1725 (revision 14748670)
|
||||
1903 (revision 14907631)
|
||||
ASP.NET (revision 13678267)
|
||||
Al Doilea Război Mondial (revision 15346198)
|
||||
Dick Durock (revision 14802579)
|
||||
Life Got Cold (revision 10112437)
|
||||
MusicBrainz (revision 15177442)
|
||||
Nicolae Popovici (jurist) (revision 15200517)
|
||||
National Diet Library (revision 12675764)
|
||||
BBC Three (revision 15290069)
|
||||
Friedrich Schleiermacher (revision 14711103)
|
||||
The Beatles (revision 15302748)
|
||||
20 ianuarie (revision 14947182)
|
||||
2 iunie (revision 15000116)
|
||||
Universal Media Disc (revision 13269523)
|
||||
Castelul Bunratty (revision 8799348)
|
||||
Londra (revision 15290324)
|
||||
23 noiembrie (revision 15307048)
|
||||
20 iulie (revision 15036777)
|
||||
2001 (revision 15111207)
|
||||
Florida (revision 15142921)
|
||||
Uzbekistan (revision 15298947)
|
||||
1938 (revision 15163477)
|
||||
23 iunie (revision 14994443)
|
||||
28 aprilie (revision 15140389)
|
||||
1811 (revision 14233359)
|
||||
Crișana (revision 15314665)
|
||||
Pronunție (revision 14476477)
|
||||
I'll Stand By You (cântec de Girls Aloud) (revision 10112432)
|
||||
2019 (revision 15344837)
|
||||
Calendarul armean (revision 14268830)
|
||||
2002 (revision 15294674)
|
||||
Australia (revision 15309171)
|
||||
Serghei Prokofiev (revision 15269322)
|
||||
Limbi ugrice (revision 15165135)
|
||||
WorldCat Identities (revision 13000969)
|
||||
Rwanda (revision 14914537)
|
||||
Brașov (revision 15335383)
|
||||
Rudolf Hess (revision 15198812)
|
||||
Limba daneză (revision 14842105)
|
||||
Lista țărilor după indicele dezvoltării umane (revision 15314050)
|
||||
Postpoziție (revision 15346785)
|
||||
16 iunie (revision 14987301)
|
||||
9 mai (revision 14936959)
|
||||
Erasmus din Rotterdam (revision 15139499)
|
||||
1939 (revision 15344797)
|
||||
Lista orașelor din Statele Unite ale Americii după populație (revision 14835883)
|
||||
Benjamin Henry Latrobe (revision 15309615)
|
||||
Wayback Machine (revision 15154168)
|
||||
Love Machine (revision 10112433)
|
||||
Discografia formației Girls Aloud (revision 15316070)
|
||||
Anii 1950 (revision 15053828)
|
||||
Ross Brawn (revision 14956382)
|
||||
Djibouti (revision 15324881)
|
||||
Cuvânt (revision 12985155)
|
||||
1 martie (revision 15348743)
|
||||
Domenico Scarlatti (revision 15271887)
|
||||
Hawaii (revision 15282894)
|
||||
Listă de compozitori de muzică cultă (revision 14649633)
|
||||
Premiul Nobel pentru Fizică (revision 15191205)
|
||||
4 mai (revision 15222001)
|
||||
27 octombrie (revision 15314197)
|
||||
8 noiembrie (revision 15277041)
|
||||
Accidentul nuclear de la Cernobîl (revision 15345489)
|
||||
Consoană africată dentală surdă (revision 14997698)
|
||||
Extended play (revision 14728849)
|
||||
Divertisment (revision 12383285)
|
||||
25 decembrie (revision 15332780)
|
||||
15 ianuarie (revision 14749726)
|
||||
Iulian Cristache (revision 11040565)
|
||||
Stat unitar (revision 15207224)
|
||||
Limba marathi (revision 15165081)
|
||||
Consoană fricativă laterală alveolară sonoră (revision 13946216)
|
||||
James Abbott McNeill Whistler (revision 15285621)
|
||||
Regatul Unit (revision 15335741)
|
||||
Albania (revision 15331282)
|
||||
Henry Cowell (revision 15119343)
|
||||
Limba valenciană (revision 15165114)
|
||||
Bronx (revision 15211973)
|
||||
Integrated Authority File (revision 15145168)
|
||||
Menuet (revision 14224105)
|
||||
Jocurile Olimpice de vară (revision 15157901)
|
||||
Acuzativ (revision 15315694)
|
||||
National and University Library in Zagreb (revision 14932231)
|
||||
20 septembrie (revision 15109959)
|
||||
1 ianuarie (revision 14833650)
|
||||
12 septembrie (revision 15058394)
|
||||
ITunes (revision 14303931)
|
||||
Tokyo (revision 15215196)
|
||||
Paris (revision 15295690)
|
||||
Universal Music Group (revision 15070153)
|
||||
Premiul Nobel pentru Literatură (revision 15129756)
|
||||
Flandra (revision 15318704)
|
||||
|
||||
== End of Parsed pages ==
|
||||
|
||||
- Wikipedia parsing ended at: 2022-12-15 00:04:31.827603
|
||||
|
||||
68 characters appeared 1537323 times.
|
||||
|
||||
Most Frequent characters:
|
||||
[ 0] Char i: 11.164992652812714 %
|
||||
[ 1] Char e: 11.007836349290292 %
|
||||
[ 2] Char a: 10.768654342646276 %
|
||||
[ 3] Char r: 7.448857527012866 %
|
||||
[ 4] Char n: 7.210586194313101 %
|
||||
[ 5] Char t: 6.14821999020375 %
|
||||
[ 6] Char l: 5.709080004657447 %
|
||||
[ 7] Char u: 5.164171745300109 %
|
||||
[ 8] Char o: 5.019569732580596 %
|
||||
[ 9] Char c: 4.31893622875609 %
|
||||
[10] Char s: 3.679578071752 %
|
||||
[11] Char d: 3.4889219767088635 %
|
||||
[12] Char m: 3.302168769998237 %
|
||||
[13] Char p: 2.6401088125267105 %
|
||||
[14] Char ă: 2.0153864867695335 %
|
||||
[15] Char b: 1.5493165717289081 %
|
||||
[16] Char g: 1.3016783070311184 %
|
||||
[17] Char f: 1.1247473692906436 %
|
||||
[18] Char v: 0.9899025774024066 %
|
||||
[19] Char ș: 0.92596025688811 %
|
||||
[20] Char ț: 0.8636441398456929 %
|
||||
[21] Char î: 0.8400967135728796 %
|
||||
[22] Char z: 0.793652342416005 %
|
||||
[23] Char h: 0.719497464098306 %
|
||||
[24] Char â: 0.4213818436333809 %
|
||||
[25] Char k: 0.327907668069755 %
|
||||
[26] Char j: 0.2703400651652255 %
|
||||
[27] Char x: 0.23144127811787113 %
|
||||
[28] Char y: 0.2307907967291194 %
|
||||
[29] Char w: 0.18811921762700487 %
|
||||
[30] Char é: 0.02998719202145548 %
|
||||
[31] Char q: 0.02205131907868418 %
|
||||
|
||||
The first 32 characters have an accumulated ratio of 0.9991758400804515.
|
||||
The first 4 characters have an accumulated ratio of 0.4039034087176214.
|
||||
All characters whose order is over 21 have an accumulated ratio of 0.03235169186956807.
|
||||
|
||||
1295 sequences found.
|
||||
|
||||
First 487 (typical positive ratio): 0.9950167482401342
|
||||
Next 267 (754-487): 0.003984360305270163
|
||||
Rest: 0.0009988914545956407
|
||||
|
||||
- Processing end: 2022-12-15 00:04:31.911782
|
||||
270
script/BuildLangModelLogs/LangRussianModel.log
Normal file
270
script/BuildLangModelLogs/LangRussianModel.log
Normal file
@ -0,0 +1,270 @@
|
||||
= Logs of language model for Russian (ru) =
|
||||
|
||||
- Generated by BuildLangModel.py
|
||||
- Started: 2022-12-17 19:53:30.416132
|
||||
- Maximum depth: 4
|
||||
- Max number of pages: 200
|
||||
|
||||
== Parsed pages ==
|
||||
|
||||
Пулмен (рабочий посёлок) (revision 127314030)
|
||||
Водонапорная башня (revision 123368499)
|
||||
Обама, Барак (revision 127312814)
|
||||
Историзм (искусство) (revision 125199154)
|
||||
Насосная станция (revision 126671775)
|
||||
Школьный округ (revision 118138873)
|
||||
Конденсат (revision 97819205)
|
||||
1880-е годы (revision 124959394)
|
||||
Линкольн, Роберт Тодд (revision 126851305)
|
||||
Габарит подвижного состава (revision 127265050)
|
||||
Межвоенный период (revision 123201828)
|
||||
Гражданская война в США (revision 127311614)
|
||||
История евреев в США (revision 123703208)
|
||||
Англо-занзибарская война (revision 127263956)
|
||||
Линкольн, Джесси Харлан (revision 87795509)
|
||||
Бенкен, Герман (revision 120809711)
|
||||
УралГАХУ (revision 126489964)
|
||||
Великобритания (revision 127175319)
|
||||
Фленсбург (revision 126961771)
|
||||
Мещанство (revision 127304945)
|
||||
Прохоров, Александр Михайлович (revision 127233579)
|
||||
VIAF (revision 122626337)
|
||||
Национальная библиотека Чешской Республики (revision 124152023)
|
||||
Регулирующая арматура (revision 116046805)
|
||||
Раннее Средневековье (revision 126932807)
|
||||
Европейская интеграция (revision 125721443)
|
||||
Бойл, Уиллард (revision 120835257)
|
||||
Бут, Эдвин (revision 126437526)
|
||||
Московский трамвай (revision 127184149)
|
||||
Лондонский метрополитен (revision 126810923)
|
||||
F-18 (revision 127113399)
|
||||
Сацумско-британская война (revision 124671983)
|
||||
Луизианская покупка (revision 123941200)
|
||||
Община (Германия) (revision 125007479)
|
||||
Запорная арматура (revision 121220496)
|
||||
Новая Англия (revision 125214368)
|
||||
Берни Сандерс (revision 126983575)
|
||||
Бак (резервуар) (revision 126670363)
|
||||
Хемингуэй, Эрнест (revision 126959711)
|
||||
2021 год (revision 127125948)
|
||||
1951 год (revision 126285688)
|
||||
Жидкость (revision 127133343)
|
||||
Большая советская энциклопедия (revision 127144085)
|
||||
Россия (revision 127297047)
|
||||
CSS Virginia (revision 121318647)
|
||||
Школа реки Гудзон (revision 123627995)
|
||||
Водозаборные сооружения (revision 123836554)
|
||||
Ривера, Диего (revision 125976771)
|
||||
Квантовая физика (revision 126896053)
|
||||
Рочестер (Нью-Йорк) (revision 126016553)
|
||||
Конденсация (теплотехника) (revision 123837631)
|
||||
Средиземноморская Антанта (revision 125156636)
|
||||
Историография (revision 121180824)
|
||||
Гбови, Лейма (revision 124860814)
|
||||
Премудрый пискарь (revision 121359555)
|
||||
Люнебургская водонапорная башня (revision 117681965)
|
||||
XVIII век (revision 126913825)
|
||||
Сислей, Альфред (revision 127063100)
|
||||
Средние века (revision 127154753)
|
||||
Энциклопедический словарь Брокгауза и Ефрона (revision 125357601)
|
||||
Нефтепровод (revision 123810227)
|
||||
Нефть (revision 126997759)
|
||||
Вентиляция (revision 126675588)
|
||||
Цилиндр (revision 126783664)
|
||||
Английский язык (revision 127275941)
|
||||
Бензин (revision 126966322)
|
||||
Министр по делам ветеранов США (revision 124072400)
|
||||
Первобытное общество (revision 127057340)
|
||||
Пикассо, Пабло (revision 126869217)
|
||||
Рисунок в разрезе (revision 121960314)
|
||||
Междупутье (revision 125745955)
|
||||
Битва при Форт-Генри (revision 123999672)
|
||||
Канал (водный) (revision 123736265)
|
||||
Белорусская народная республика (revision 126958885)
|
||||
25 апреля (revision 127246597)
|
||||
Насос (revision 126768788)
|
||||
Теннесси (revision 124804069)
|
||||
Локомотив (revision 127032264)
|
||||
Габарит погрузки (revision 123372556)
|
||||
Вебби (revision 121964659)
|
||||
Алегзандрия (Виргиния) (revision 126338837)
|
||||
Война Фаррапус (revision 125765352)
|
||||
Образование в США (revision 126788195)
|
||||
Пресс-конференция (revision 127075029)
|
||||
Рио-де-Жанейро (revision 127002708)
|
||||
Габарит приближения строений (revision 117538368)
|
||||
Международный идентификатор стандартных наименований (revision 120216410)
|
||||
Мопассан, Ги де (revision 127086462)
|
||||
История Европейского союза (revision 123952687)
|
||||
Прусский социализм (revision 127165836)
|
||||
Библиотека Александрина (revision 126093192)
|
||||
Тэйкан-дзукури (revision 124877986)
|
||||
1883 год (revision 125476166)
|
||||
Конфликт на Китайско-Восточной железной дороге (revision 122499702)
|
||||
Энергетический уровень (revision 119322956)
|
||||
Алюминий (revision 126861293)
|
||||
Санкт-петербургский трамвай (revision 127306763)
|
||||
Национальная библиотека Франции (revision 127015965)
|
||||
12 мая (revision 127207333)
|
||||
Граммофон (revision 126498827)
|
||||
Маккьяйоли (revision 126836176)
|
||||
Канализационная установка (revision 123736401)
|
||||
Газ (revision 126950046)
|
||||
Луизиана (revision 127312945)
|
||||
Память Парижской Коммуны (revision 126960401)
|
||||
Сталь (revision 127216605)
|
||||
Семья Барака Обамы (revision 124529726)
|
||||
Поверхностный насос (revision 121146223)
|
||||
Каразин, Николай Николаевич (revision 127097562)
|
||||
Кирпичная готика (revision 125337841)
|
||||
The Century Magazine (revision 127098805)
|
||||
Контрольный номер Библиотеки Конгресса (revision 113360170)
|
||||
Русско-персидская война (1804—1813) (revision 126999654)
|
||||
Берн (revision 122913269)
|
||||
Поздняя античность (revision 127266287)
|
||||
Гарвардский университет (revision 127033732)
|
||||
Бои на Халхин-Голе (revision 126542980)
|
||||
Алый знак доблести (фильм, 1951) (revision 120728355)
|
||||
Водопровод (revision 127182411)
|
||||
Пар (revision 126003244)
|
||||
1971 год (revision 127068279)
|
||||
Искусство Древнего Египта (revision 125737336)
|
||||
Пенсильванский университет (Индиана) (revision 123963620)
|
||||
Национальная библиотека Израиля (revision 126108080)
|
||||
1884 год (revision 125476122)
|
||||
Проезд снаружи поездов (revision 127239100)
|
||||
Норвегия (revision 126986958)
|
||||
Барбур, Джеймс (revision 126851158)
|
||||
Французская интервенция в Испанию (revision 119666106)
|
||||
Англия (revision 127268120)
|
||||
Галлатин, Альберт (revision 127160198)
|
||||
Калифорния (revision 127027363)
|
||||
Роял, Кеннет Клайборн (revision 110605693)
|
||||
США (revision 126887888)
|
||||
Федеральная архитектура (revision 116000492)
|
||||
Конденсат Бозе — Эйнштейна (revision 125188375)
|
||||
Колонна (revision 126876842)
|
||||
1907 год (revision 127134918)
|
||||
13 сентября (revision 125587404)
|
||||
Генрих Лев (revision 126407574)
|
||||
Этрусское искусство (revision 123158050)
|
||||
Амальрик, Андрей Алексеевич (revision 126033545)
|
||||
9 декабря (revision 127201233)
|
||||
Селищи (22712000298) (revision 124521248)
|
||||
1798 год (revision 125783094)
|
||||
Мюледорф (Берн) (revision 121861015)
|
||||
Большая игра (revision 126891168)
|
||||
Битва (revision 124395796)
|
||||
Война не-персе (revision 127189710)
|
||||
Президентские выборы в США (2020) (revision 126639368)
|
||||
Площадь Карла Фаберже (revision 123223942)
|
||||
Банкрофт, Джордж (revision 126851184)
|
||||
Кобаяси, Макото (revision 121939251)
|
||||
Газойль (revision 123647640)
|
||||
Ватиканская апостольская библиотека (revision 124986491)
|
||||
Общественная собственность (revision 125722109)
|
||||
Славная революция (revision 122270271)
|
||||
Золя (revision 127092383)
|
||||
Офицер (revision 126230098)
|
||||
Метастабильное состояние (revision 118552209)
|
||||
Лыжные гонки (revision 124233040)
|
||||
Средиземное море (revision 126980465)
|
||||
Защитная арматура (revision 124665168)
|
||||
Президент Турции (revision 123861767)
|
||||
Макдональд, Артур (revision 123992590)
|
||||
Песок (revision 126799930)
|
||||
Сублимация (физика) (revision 127108939)
|
||||
Новицкий, Василий Фёдорович (revision 126350745)
|
||||
Список султанов Занзибара (revision 94020222)
|
||||
Туман (revision 124866163)
|
||||
2005 год (revision 127291761)
|
||||
Исламская Республика Афганистан (revision 126605442)
|
||||
Викисловарь (revision 126840626)
|
||||
22 января (revision 126465130)
|
||||
Российская национальная библиотека (revision 126055277)
|
||||
Наука в США (revision 124150312)
|
||||
Екатеринбургский завод (revision 125779202)
|
||||
Океания (revision 125374219)
|
||||
Нидершерли (revision 116230829)
|
||||
Война за австрийское наследство (revision 126874381)
|
||||
Доминиканская Республика (revision 127046641)
|
||||
Военный паровоз (revision 124117506)
|
||||
Подземные воды (revision 126705165)
|
||||
5 сентября (revision 126628763)
|
||||
Кафка, Франц (revision 127130321)
|
||||
Двухванная печь (revision 123510834)
|
||||
Чертаново Южное (revision 122081039)
|
||||
|
||||
== End of Parsed pages ==
|
||||
|
||||
- Wikipedia parsing ended at: 2022-12-17 19:57:30.506110
|
||||
|
||||
63 characters appeared 2343890 times.
|
||||
|
||||
Most Frequent characters:
|
||||
[ 0] Char о: 10.136567842347551 %
|
||||
[ 1] Char и: 8.217151828797427 %
|
||||
[ 2] Char а: 7.941797609956098 %
|
||||
[ 3] Char е: 7.781337861418411 %
|
||||
[ 4] Char н: 6.689093771465385 %
|
||||
[ 5] Char с: 5.755304216494801 %
|
||||
[ 6] Char р: 5.58695160609073 %
|
||||
[ 7] Char т: 5.486136294791991 %
|
||||
[ 8] Char в: 4.621547939536412 %
|
||||
[ 9] Char л: 4.156039745892512 %
|
||||
[10] Char к: 3.458694733967891 %
|
||||
[11] Char м: 2.899666793236884 %
|
||||
[12] Char д: 2.856064064439884 %
|
||||
[13] Char п: 2.69799350652121 %
|
||||
[14] Char у: 2.0648579924825823 %
|
||||
[15] Char я: 1.9596482770095867 %
|
||||
[16] Char г: 1.812798382176638 %
|
||||
[17] Char ы: 1.7729500957809452 %
|
||||
[18] Char б: 1.5043794717328884 %
|
||||
[19] Char з: 1.4936707780655236 %
|
||||
[20] Char й: 1.4190938994577391 %
|
||||
[21] Char ь: 1.2650764327677493 %
|
||||
[22] Char ч: 1.0549983147673312 %
|
||||
[23] Char х: 1.0016255029032932 %
|
||||
[24] Char ж: 0.7652236239755279 %
|
||||
[25] Char ц: 0.5965297006258826 %
|
||||
[26] Char ю: 0.5917513193878552 %
|
||||
[27] Char ш: 0.5520310253467526 %
|
||||
[28] Char ф: 0.4393977533075358 %
|
||||
[29] Char щ: 0.3068403380704726 %
|
||||
[30] Char э: 0.3063710327703092 %
|
||||
[31] Char i: 0.25978181569954223 %
|
||||
[32] Char ё: 0.24984107615971737 %
|
||||
[33] Char e: 0.2357619171548153 %
|
||||
[34] Char a: 0.21839762104876934 %
|
||||
[35] Char n: 0.18004257878996027 %
|
||||
[36] Char r: 0.1703151598411188 %
|
||||
[37] Char t: 0.16216631326555428 %
|
||||
[38] Char s: 0.15969179441014725 %
|
||||
[39] Char o: 0.1568759626091668 %
|
||||
[40] Char l: 0.1263711180985456 %
|
||||
[41] Char c: 0.09795681537956133 %
|
||||
[42] Char d: 0.08571221345711616 %
|
||||
[43] Char h: 0.07956858043679524 %
|
||||
[44] Char m: 0.07009714619713382 %
|
||||
[45] Char u: 0.0688598867694303 %
|
||||
[46] Char x: 0.05725524661993524 %
|
||||
[47] Char p: 0.05644462837419845 %
|
||||
[48] Char b: 0.05482339188272487 %
|
||||
[49] Char g: 0.051111613599614324 %
|
||||
[50] Char f: 0.05038632359027087 %
|
||||
[51] Char y: 0.04923439239896071 %
|
||||
[52] Char v: 0.0470158582527337 %
|
||||
[53] Char ъ: 0.03617917223077875 %
|
||||
|
||||
The first 54 characters have an accumulated ratio of 0.9991548238185242.
|
||||
The first 5 characters have an accumulated ratio of 0.40765948913984873.
|
||||
All characters whose order is over 29 have an accumulated ratio of 0.030302616590369.
|
||||
|
||||
1554 sequences found.
|
||||
|
||||
First 819 (typical positive ratio): 0.9950050289366638
|
||||
Next 260 (1079-819): 0.003999322715788067
|
||||
Rest: 0.0009956483475481726
|
||||
|
||||
- Processing end: 2022-12-17 19:57:30.653466
|
||||
251
script/BuildLangModelLogs/LangSerbianModel.log
Normal file
251
script/BuildLangModelLogs/LangSerbianModel.log
Normal file
@ -0,0 +1,251 @@
|
||||
= Logs of language model for Serbian (sr) =
|
||||
|
||||
- Generated by BuildLangModel.py
|
||||
- Started: 2022-12-17 22:32:34.945303
|
||||
- Maximum depth: 4
|
||||
- Max number of pages: 200
|
||||
|
||||
== Parsed pages ==
|
||||
|
||||
Ратно_ваздухопловство_и_противваздушна_одбрана_Војске_Републике_Српске (revision 24582261)
|
||||
Рат у Босни и Херцеговини (revision 25415566)
|
||||
Момчило Крајишник (revision 25271534)
|
||||
Предсједник Републике Српске (revision 25359856)
|
||||
Драган Вуковић (revision 25343578)
|
||||
М53/59 Прага (revision 25379553)
|
||||
2. батаљон Војне полиције (revision 23891722)
|
||||
Никола Делић (revision 24741676)
|
||||
Источнобосански корпус Војске Републике Српске (revision 24462780)
|
||||
5. козарска лака пјешадијска бригада (revision 25381013)
|
||||
Радомир Чавић (revision 24221757)
|
||||
25. јануар (revision 25269630)
|
||||
6. санска лака пјешадијска бригада (revision 24929928)
|
||||
17. кључка лака пјешадијска бригада (revision 24355928)
|
||||
Орлови Грмеча (revision 23891727)
|
||||
Мићо Влаисављевић (revision 24915097)
|
||||
Цвјетко Савић (revision 25199639)
|
||||
Патриотска лига (БиХ) (revision 21762882)
|
||||
Упала плућа (revision 24386531)
|
||||
Милорад Кутлешић (revision 25169318)
|
||||
92. мјешовита авијацијска бригада (revision 24461739)
|
||||
Историја Републике Српске (1992—1995) (revision 25396800)
|
||||
17. август (revision 25162661)
|
||||
Социјалистичка партија (Република Српска) (revision 25339684)
|
||||
Сима Лозанић (revision 25227668)
|
||||
Орден за војне заслуге (revision 24140948)
|
||||
Живомир Нинковић (revision 24589570)
|
||||
2. подрињска лака пјешадијска бригада (revision 24490712)
|
||||
1. которварошка лака пјешадијска бригада (revision 23884431)
|
||||
Анте Марковић (revision 25363253)
|
||||
15. новембар (revision 25387946)
|
||||
Радивоје Милетић (revision 24232926)
|
||||
Policija Republike Srpske (revision 24293827)
|
||||
Стаменко Новаковић (revision 24933201)
|
||||
Самоопредељење (revision 25423052)
|
||||
18. децембар (revision 25259258)
|
||||
2. оклопна бригада (revision 24369237)
|
||||
Славко Лисица (revision 25424390)
|
||||
УНКРО (revision 24399325)
|
||||
25. децембар (revision 25394962)
|
||||
Београд (revision 25435556)
|
||||
Епархија захумско-херцеговачка и приморска (revision 25193025)
|
||||
Будимир Гаврић (revision 25205770)
|
||||
Први батаљон војне полиције 1.КК (revision 23998235)
|
||||
1950 (revision 25396634)
|
||||
Радивоје Томанић (revision 24051372)
|
||||
Јован Марић (revision 24589591)
|
||||
Мило Ђукановић (revision 25437661)
|
||||
Џон Херт (revision 24441263)
|
||||
Дринска бановина (revision 25409388)
|
||||
43. приједорска моторизована бригада (revision 25347814)
|
||||
Југословенска народна армија (revision 25388846)
|
||||
Питер Фајт (revision 25425148)
|
||||
2. сарајевска лака пјешадијска бригада (revision 24591755)
|
||||
2. теслићка лака пјешадијска бригада (revision 24027353)
|
||||
Драгиша Масал (revision 25302857)
|
||||
Карађорђева звијезда (revision 24721051)
|
||||
Перо Млађеновић (revision 25424880)
|
||||
Татра (revision 25171753)
|
||||
1831 (revision 24734221)
|
||||
Карингтон-Кутиљеров план (revision 25391410)
|
||||
Poli(metil metakrilat) (revision 20647284)
|
||||
3. сарајевска пјешадијска бригада (revision 24591756)
|
||||
Милутин Скочајић (revision 24619682)
|
||||
Владимир Арсић (revision 24238327)
|
||||
Гарда Пантери (revision 24200236)
|
||||
4. август (revision 25139596)
|
||||
Владо Спремо (revision 25339001)
|
||||
Манојло Миловановић (revision 25368228)
|
||||
Мировни планови прије и током Рата у БиХ (revision 24482990)
|
||||
Вашингтонски споразум (1994) (revision 22769830)
|
||||
Чедо Сладоје (revision 24464996)
|
||||
24. фебруар (revision 25270517)
|
||||
Сарајевско-романијски корпус Војске Републике Српске (revision 24591773)
|
||||
Момир Зец (revision 25274792)
|
||||
ЈНА (revision 25388846)
|
||||
Momir Talić (revision 24773518)
|
||||
Вељко Стојановић (revision 24591774)
|
||||
Здравко Толимир (revision 24593446)
|
||||
Новак Ђукић (revision 24724367)
|
||||
Оклоп (revision 23883339)
|
||||
Представништва Републике Српске у иностранству (revision 25423590)
|
||||
Словачка (revision 25283209)
|
||||
Ваздухопловни завод Космос (revision 25214359)
|
||||
Спасоје Орашанин (revision 24464248)
|
||||
Битка за Возућу (revision 25351440)
|
||||
Операција Намјерна сила (revision 25416485)
|
||||
Дејтонски мировни споразум (revision 25403210)
|
||||
1. херцеговачка моторизована бригада (Требиње) (revision 25162762)
|
||||
Москва (revision 25329265)
|
||||
Богдан Суботић (revision 24318915)
|
||||
Чехословачка (revision 25210184)
|
||||
11. мркоњићка лака пешадијска бригада (revision 23887232)
|
||||
13. новембар (revision 25357481)
|
||||
Светозар Андрић (revision 25255141)
|
||||
Мате Бобан (revision 24220533)
|
||||
Блаж Краљевић (revision 24737190)
|
||||
Логор Узамница (revision 24525764)
|
||||
Абасиди (revision 25422122)
|
||||
19. јануар (revision 25314987)
|
||||
Предсједник Владе Републике Српске (revision 25340553)
|
||||
Милован Станковић (политичар, 1958) (revision 25152054)
|
||||
Топ (revision 25201602)
|
||||
Емил Влајки (revision 24038256)
|
||||
7. извиђачко-диверзантски одред (revision 24073614)
|
||||
Никола Мишковић (revision 25228450)
|
||||
Инцидент код Мркоњић Града (revision 25389261)
|
||||
БОВ (оклопни транспортер) (revision 25252351)
|
||||
Мићо Грубор (revision 24289250)
|
||||
Социјалистичка Република Босна и Херцеговина (revision 24573038)
|
||||
13. фебруар (revision 25259071)
|
||||
Маринко Шиљеговић (revision 24589619)
|
||||
Европски ратови (revision 25166321)
|
||||
1991 (revision 25356221)
|
||||
1999 (revision 25425404)
|
||||
Станислав Галић (revision 24775466)
|
||||
Самостални пјешадијски батаљон Скелани (revision 24236929)
|
||||
Бијело Брдо (Дервента) (revision 23651156)
|
||||
Војска Републике Српске (revision 25349210)
|
||||
Радомир Лукић (revision 24268767)
|
||||
1961 (revision 24417631)
|
||||
Орден слободе (revision 25287659)
|
||||
Ослободилачка национална армија (revision 24118083)
|
||||
2. семберска лака пјешадијска бригада (revision 24461729)
|
||||
Операција Спреча 95 (revision 24403645)
|
||||
Драгомир Милошевић (revision 24780575)
|
||||
26. фебруар (revision 25147680)
|
||||
Operacija Una (revision 24725456)
|
||||
Божо Новак (revision 25300274)
|
||||
Херцеговачки корпус Војске Републике Српске (revision 24479790)
|
||||
2. крајишки корпус Војске Републике Српске (revision 25162755)
|
||||
Операција Звезда (revision 24403718)
|
||||
Војно медицински центар (revision 23886998)
|
||||
Бошко Келечевић (revision 23631478)
|
||||
Предузетник (revision 24933587)
|
||||
Бошко Гвозден (revision 25269514)
|
||||
15. март (revision 25314965)
|
||||
Територијална одбрана Српске Крајине (revision 24437954)
|
||||
Вукови са Вучијака (revision 25371082)
|
||||
4. јануар (revision 25388556)
|
||||
Република Босна и Херцеговина (revision 25139818)
|
||||
16. октобар (revision 25396928)
|
||||
Немања Недовић (revision 25359086)
|
||||
Операција Штит (revision 25375240)
|
||||
Митко Стојковски (revision 25232528)
|
||||
Српска академија наука и уметности (revision 25413602)
|
||||
Француска револуција (revision 25223439)
|
||||
Србија (revision 25433539)
|
||||
7. лаки артиљеријски пук ПВО Херцеговачког корпуса (revision 23891985)
|
||||
Рајко Балаћ (revision 24926978)
|
||||
Битка за Купрес (1994) (revision 25241940)
|
||||
Паравојска (revision 25390961)
|
||||
Саво Сокановић (revision 23929682)
|
||||
Брег (река) (revision 24516962)
|
||||
7. купрешка моторизована бригада (revision 23891702)
|
||||
Народна партија Српске (revision 25339710)
|
||||
Radislav Krstić (revision 25169830)
|
||||
Центар војних школа ВРС „Генерал Рајко Балаћ” (revision 25160226)
|
||||
Генерал (ЈНА) (revision 24124119)
|
||||
Ратно ваздухопловство и противваздушна одбрана Војске Републике Српске (revision 24582261)
|
||||
15. век (revision 25356672)
|
||||
Богдан Сладојевић (revision 25312818)
|
||||
Карађорђе Петровић (revision 25412434)
|
||||
Херцеговачки санџак (revision 23536553)
|
||||
7. март (revision 25315286)
|
||||
НАТО бомбардовање Републике Српске (revision 25416485)
|
||||
Владо Лиздек (revision 25116123)
|
||||
1998 (revision 25434166)
|
||||
1. добојска лака пјешадијска бригада (revision 25258891)
|
||||
Сан Марино (revision 25357929)
|
||||
Карловачка митрополија (revision 25413663)
|
||||
|
||||
== End of Parsed pages ==
|
||||
|
||||
- Wikipedia parsing ended at: 2022-12-17 22:36:36.028806
|
||||
|
||||
64 characters appeared 1001054 times.
|
||||
|
||||
Most Frequent characters:
|
||||
[ 0] Char а: 11.390294629460548 %
|
||||
[ 1] Char и: 8.957958311939217 %
|
||||
[ 2] Char о: 8.472270227180552 %
|
||||
[ 3] Char е: 8.20595092772218 %
|
||||
[ 4] Char р: 6.002373498332758 %
|
||||
[ 5] Char н: 5.695996419773559 %
|
||||
[ 6] Char с: 5.0323958547690735 %
|
||||
[ 7] Char к: 4.062318316494415 %
|
||||
[ 8] Char у: 4.039442427681224 %
|
||||
[ 9] Char т: 3.779216705592306 %
|
||||
[10] Char в: 3.5658416029504902 %
|
||||
[11] Char ј: 3.2442805283231473 %
|
||||
[12] Char д: 3.23958547690734 %
|
||||
[13] Char п: 3.0462892111714255 %
|
||||
[14] Char л: 2.7947543289373 %
|
||||
[15] Char м: 2.7568942334779143 %
|
||||
[16] Char б: 1.946947916895592 %
|
||||
[17] Char г: 1.7272794474623747 %
|
||||
[18] Char з: 1.4506709927736166 %
|
||||
[19] Char ц: 0.9910554275793314 %
|
||||
[20] Char ч: 0.8583952514050192 %
|
||||
[21] Char ш: 0.8464078860880633 %
|
||||
[22] Char х: 0.7302303372245653 %
|
||||
[23] Char њ: 0.5863819534210941 %
|
||||
[24] Char i: 0.5255460744375429 %
|
||||
[25] Char a: 0.5215502859985576 %
|
||||
[26] Char ћ: 0.4614136699918286 %
|
||||
[27] Char љ: 0.4285483100811744 %
|
||||
[28] Char e: 0.38309621658771653 %
|
||||
[29] Char o: 0.3636167479476632 %
|
||||
[30] Char ж: 0.3585221176879569 %
|
||||
[31] Char ф: 0.33494696589794354 %
|
||||
[32] Char n: 0.3133697083274229 %
|
||||
[33] Char ђ: 0.2959880286178368 %
|
||||
[34] Char r: 0.2686168778107874 %
|
||||
[35] Char s: 0.25932666969014656 %
|
||||
[36] Char t: 0.2007883690590118 %
|
||||
[37] Char u: 0.19029942440667535 %
|
||||
[38] Char j: 0.18690300423353784 %
|
||||
[39] Char k: 0.1803099533092121 %
|
||||
[40] Char l: 0.1803099533092121 %
|
||||
[41] Char p: 0.16782311443738301 %
|
||||
[42] Char m: 0.16312806302157526 %
|
||||
[43] Char c: 0.1390534376766888 %
|
||||
[44] Char v: 0.1382542799888917 %
|
||||
[45] Char d: 0.1287642824463016 %
|
||||
[46] Char b: 0.09040471343204262 %
|
||||
[47] Char g: 0.06942682412736975 %
|
||||
[48] Char z: 0.061834826093297664 %
|
||||
[49] Char h: 0.05254461797265682 %
|
||||
[50] Char џ: 0.04285483100811745 %
|
||||
|
||||
The first 51 characters have an accumulated ratio of 0.9993047328116168.
|
||||
The first 5 characters have an accumulated ratio of 0.43028847594635256.
|
||||
All characters whose order is over 31 have an accumulated ratio of 0.03130000978968167.
|
||||
|
||||
1174 sequences found.
|
||||
|
||||
First 658 (typical positive ratio): 0.9950262953064305
|
||||
Next 193 (851-658): 0.0039828494616473975
|
||||
Rest: 0.0009908552319221053
|
||||
|
||||
- Processing end: 2022-12-17 22:36:36.141687
|
||||
246
script/BuildLangModelLogs/LangSlovakModel.log
Normal file
246
script/BuildLangModelLogs/LangSlovakModel.log
Normal file
@ -0,0 +1,246 @@
|
||||
= Logs of language model for Slovak (sk) =
|
||||
|
||||
- Generated by BuildLangModel.py
|
||||
- Started: 2022-12-15 00:13:57.091311
|
||||
- Maximum depth: 4
|
||||
- Max number of pages: 200
|
||||
|
||||
== Parsed pages ==
|
||||
|
||||
Európska_únia (revision 7505135)
|
||||
Atény (revision 7444559)
|
||||
Tanzánia (revision 7450772)
|
||||
Nobelova cena (revision 7344071)
|
||||
Paraguaj (revision 7405402)
|
||||
Kajláš Satjárthí (revision 7469915)
|
||||
Svätý Krištof a Nevis (revision 7435732)
|
||||
Mongolsko (revision 7405366)
|
||||
Európsky nástroj finančnej stability (revision 7370223)
|
||||
Theresa Mayová (revision 7246748)
|
||||
ISO 3166-1 (revision 7323373)
|
||||
Omán (revision 6960459)
|
||||
Autobus (revision 7491250)
|
||||
Virtual International Authority File (revision 6677294)
|
||||
Autorita (knihovníctvo) (revision 6677295)
|
||||
Nobelova cena za fyziku (revision 7446468)
|
||||
Zoznam štátov podľa počtu obyvateľov (revision 7491272)
|
||||
International Standard Name Identifier (revision 6903717)
|
||||
Les (revision 7449029)
|
||||
Tonga (revision 7296153)
|
||||
1968 (revision 7192352)
|
||||
.kn (revision 7409198)
|
||||
Latinčina (revision 7383946)
|
||||
Gabon (revision 7405669)
|
||||
Thajsko (revision 7505117)
|
||||
Letný čas (revision 7340851)
|
||||
Mauritánia (revision 7444213)
|
||||
University of Oxford (revision 7501403)
|
||||
Aruba (revision 7449746)
|
||||
Gemeinsame Normdatei (revision 6677292)
|
||||
Alma mater (revision 6621173)
|
||||
Írsko (revision 7430451)
|
||||
Angličtina (revision 7323198)
|
||||
India (revision 7417994)
|
||||
Rakúsko (revision 7449882)
|
||||
Kolumbia (revision 7421669)
|
||||
Konzervatívna a unionistická strana (revision 7102614)
|
||||
Pobrežie Slonoviny (revision 7303057)
|
||||
Srí Lanka (revision 7188164)
|
||||
Macao (revision 7042657)
|
||||
2017 (revision 7451970)
|
||||
Sofia (revision 7348882)
|
||||
Časové pásmo (revision 7482347)
|
||||
Paródia (revision 6824524)
|
||||
Národné autority Českej republiky (revision 6677293)
|
||||
Izrael (revision 7502535)
|
||||
Stavanger (revision 7037103)
|
||||
Svätý Tomáš a Princov ostrov (revision 7494437)
|
||||
Čad (revision 7474456)
|
||||
Kišiňov (revision 7220681)
|
||||
Sierra Leone (revision 7208455)
|
||||
Tanganika (jazero) (revision 7199102)
|
||||
Dynamit (revision 7389224)
|
||||
Albánsko (revision 7420007)
|
||||
Švédsko (revision 7504708)
|
||||
Stredná Afrika (revision 7145045)
|
||||
Závet (revision 6956611)
|
||||
1924 (revision 7421571)
|
||||
Portugalsko (revision 7506280)
|
||||
13. júl (revision 7199116)
|
||||
Čarodejníctvo (revision 7475115)
|
||||
Oslo (revision 7491164)
|
||||
Falklandské ostrovy (revision 7435087)
|
||||
Argentína (revision 7398639)
|
||||
Rieka (revision 7256138)
|
||||
Salvádor (revision 7405470)
|
||||
Štvorcový kilometer (revision 7325538)
|
||||
Jordánsko (revision 7156210)
|
||||
Nobelova cena za mier (revision 7447902)
|
||||
Kostarika (revision 7491077)
|
||||
Slovenský náučný slovník (revision 7208557)
|
||||
Nobelova cena za literatúru (revision 7447686)
|
||||
Grécka kolonizácia (revision 7352256)
|
||||
Uruguaj (revision 7405554)
|
||||
1974 (revision 7437299)
|
||||
Pireus (mesto) (revision 6924764)
|
||||
Karibské more (revision 7147501)
|
||||
Európska únia (revision 7505135)
|
||||
15. storočie (revision 7072979)
|
||||
Eastbourne (revision 6331454)
|
||||
Hongkong (revision 7507993)
|
||||
Library of Congress Control Number (revision 6676257)
|
||||
Medzinárodná agentúra pre atómovú energiu (revision 7362834)
|
||||
Americké Panenské ostrovy (revision 6888602)
|
||||
Holandsko (revision 7449775)
|
||||
Pontos (revision 6888197)
|
||||
Peňažná jednotka (revision 7005476)
|
||||
Fyzika (revision 7261378)
|
||||
Solón (revision 7482796)
|
||||
90. roky 20. storočia (revision 7481259)
|
||||
Peñón de Vélez de la Gomera (revision 5297137)
|
||||
Severné Macedónsko (revision 7433659)
|
||||
Martti Ahtisaari (revision 7454277)
|
||||
Šestnástková sústava (revision 7507601)
|
||||
Senegal (revision 7345517)
|
||||
Reykjavik (revision 7196501)
|
||||
Kaunas (revision 7037139)
|
||||
Ján Seneš (revision 7274343)
|
||||
1777 (revision 6703199)
|
||||
560 pred Kr. (revision 5886589)
|
||||
Mali (revision 7493855)
|
||||
Svätá Helena (zámorské územie) (revision 7435092)
|
||||
Barbados (revision 7277746)
|
||||
Grenada (revision 7435546)
|
||||
Bravčové mäso (revision 7054645)
|
||||
Nemecko (revision 7446359)
|
||||
Union List of Artist Names (revision 7149512)
|
||||
Rezina (okres) (revision 6454892)
|
||||
Spojené štáty (revision 7483436)
|
||||
Ghana (revision 7443354)
|
||||
Stredoafrická republika (revision 7480905)
|
||||
Zambia (revision 7450471)
|
||||
Deutsche Nationalbibliothek (revision 7093060)
|
||||
Library of Congress (revision 7440666)
|
||||
Všeobecná deklarácia ľudských práv (revision 7352266)
|
||||
.tf (revision 7409285)
|
||||
1808 (revision 6600752)
|
||||
Ostrov (revision 7486442)
|
||||
1992 (revision 7343700)
|
||||
Toulouse (revision 7336217)
|
||||
Irán (revision 7347645)
|
||||
Perejil (revision 5612466)
|
||||
Severný prieliv (revision 6454548)
|
||||
Starogréčtina (revision 6984822)
|
||||
Togo (revision 7443353)
|
||||
Dominikánska republika (revision 7361613)
|
||||
Andorra (mesto) (revision 6893271)
|
||||
Šelf (revision 7030011)
|
||||
Estónska koruna (revision 7423884)
|
||||
Karibik (revision 7151324)
|
||||
Karel Čapek (revision 7392452)
|
||||
Irak (revision 7416665)
|
||||
Island (revision 7444351)
|
||||
Estónsko (revision 7502711)
|
||||
Katalánčina (revision 7101045)
|
||||
Longyearbyen (revision 7016461)
|
||||
1972 (revision 7415648)
|
||||
Mešita Baňa Baši (revision 7148744)
|
||||
Európska organizácia pre jadrový výskum (revision 7289999)
|
||||
Volvo Group (revision 6124332)
|
||||
Maurícius (revision 7170186)
|
||||
Polis (revision 7020245)
|
||||
Guatemala (štát) (revision 7409908)
|
||||
Rybolov (revision 7104268)
|
||||
Administratívne členenie Španielska (revision 7126953)
|
||||
Medicína (revision 7358409)
|
||||
Storočie (revision 7446247)
|
||||
.sz (revision 7409282)
|
||||
ISO 3166-2:JO (revision 6547977)
|
||||
30. marec (revision 7344207)
|
||||
1497 (revision 5941683)
|
||||
Hišám Al-Karrúdž (revision 7463302)
|
||||
Vlajka Holandska (revision 7359148)
|
||||
Čile (revision 7334706)
|
||||
Francúzština (revision 7392820)
|
||||
Niger (revision 7453071)
|
||||
Prezident (revision 6855726)
|
||||
1557 (revision 7243476)
|
||||
Zoznam smerových telefónnych čísiel štátov (revision 7170973)
|
||||
Desaťročie (revision 6959661)
|
||||
Kodros (revision 6399114)
|
||||
Belize (revision 7435618)
|
||||
20. storočie (revision 6735798)
|
||||
Fidži (revision 7269015)
|
||||
Zoznam štátnych hymien (revision 7286529)
|
||||
Aun Schan Su Ťij (revision 7471093)
|
||||
Tunisko (revision 7449060)
|
||||
Medzinárodná organizácia pre normalizáciu (revision 6864058)
|
||||
Ústie (rieka, potok) (revision 7318511)
|
||||
Istanbul (revision 7419850)
|
||||
Manchester (revision 7276088)
|
||||
Jacques Chirac (revision 7461866)
|
||||
Kongo (Brazzaville) (revision 7475571)
|
||||
|
||||
== End of Parsed pages ==
|
||||
|
||||
- Wikipedia parsing ended at: 2022-12-15 00:27:35.473786
|
||||
|
||||
67 characters appeared 919864 times.
|
||||
|
||||
Most Frequent characters:
|
||||
[ 0] Char o: 8.871746258142508 %
|
||||
[ 1] Char a: 8.849569066731604 %
|
||||
[ 2] Char e: 7.74190532513502 %
|
||||
[ 3] Char n: 6.56053503561396 %
|
||||
[ 4] Char i: 5.963707678526391 %
|
||||
[ 5] Char r: 5.602241200873173 %
|
||||
[ 6] Char s: 4.921923240826905 %
|
||||
[ 7] Char t: 4.686018802779541 %
|
||||
[ 8] Char v: 4.35346964333858 %
|
||||
[ 9] Char k: 4.162354435003436 %
|
||||
[10] Char l: 3.749141177391441 %
|
||||
[11] Char m: 3.1474217927867603 %
|
||||
[12] Char d: 3.062844072602037 %
|
||||
[13] Char p: 2.783781080681492 %
|
||||
[14] Char u: 2.783020098623275 %
|
||||
[15] Char c: 2.470800031308976 %
|
||||
[16] Char h: 2.173690893436421 %
|
||||
[17] Char j: 2.0940052007688092 %
|
||||
[18] Char á: 2.0062748406286146 %
|
||||
[19] Char z: 1.8901707208891747 %
|
||||
[20] Char b: 1.618934972996008 %
|
||||
[21] Char y: 1.5046789525408104 %
|
||||
[22] Char ý: 1.0958141638329144 %
|
||||
[23] Char í: 1.0181939938947497 %
|
||||
[24] Char é: 0.9627510153674891 %
|
||||
[25] Char č: 0.9560108885661358 %
|
||||
[26] Char ú: 0.8374063992068392 %
|
||||
[27] Char š: 0.7816372855117713 %
|
||||
[28] Char g: 0.6546619935120843 %
|
||||
[29] Char ž: 0.6424862805806075 %
|
||||
[30] Char f: 0.45452371220093407 %
|
||||
[31] Char ľ: 0.38168685805727803 %
|
||||
[32] Char ť: 0.30145760677665395 %
|
||||
[33] Char ó: 0.2390570780028352 %
|
||||
[34] Char ä: 0.12730142716749432 %
|
||||
[35] Char ô: 0.12404007548942018 %
|
||||
[36] Char w: 0.09816668551003191 %
|
||||
[37] Char x: 0.08457772018472295 %
|
||||
[38] Char ď: 0.08446900846212048 %
|
||||
[39] Char ň: 0.08392544984910813 %
|
||||
[40] Char q: 0.012827983267091655 %
|
||||
[41] Char ĺ: 0.012610559821886714 %
|
||||
[42] Char ë: 0.008044667472582904 %
|
||||
[43] Char ŕ: 0.007718532304775489 %
|
||||
|
||||
The first 44 characters have an accumulated ratio of 0.9996760390666448.
|
||||
The first 6 characters have an accumulated ratio of 0.4358970456502265.
|
||||
All characters whose order is over 27 have an accumulated ratio of 0.03317555638659628.
|
||||
|
||||
1391 sequences found.
|
||||
|
||||
First 769 (typical positive ratio): 0.9950122209740424
|
||||
Next 247 (1016-769): 0.003989961669689679
|
||||
Rest: 0.000997817356267916
|
||||
|
||||
- Processing end: 2022-12-15 00:27:36.170101
|
||||
205
script/BuildLangModelLogs/LangSloveneModel.log
Normal file
205
script/BuildLangModelLogs/LangSloveneModel.log
Normal file
@ -0,0 +1,205 @@
|
||||
= Logs of language model for Slovene (sl) =
|
||||
|
||||
- Generated by BuildLangModel.py
|
||||
- Started: 2022-12-15 00:20:08.126858
|
||||
- Maximum depth: 4
|
||||
- Max number of pages: 200
|
||||
|
||||
== Parsed pages ==
|
||||
|
||||
Ljubljana (revision 5845001)
|
||||
DJ Umek (revision 5769996)
|
||||
Oskrba s pitno vodo mesta Ljubljana (revision 5783653)
|
||||
Park Arturo Toscanini, Ljubljana (revision 4923825)
|
||||
Evgen Betetto (revision 5531253)
|
||||
Registracijske oznake za cestna vozila v Sloveniji (revision 5851310)
|
||||
Seznam slovenskih slikarjev (revision 5852050)
|
||||
Maribor (revision 5849582)
|
||||
Ilirsko kraljestvo (revision 5754790)
|
||||
Miroslav Cerar (revision 5508034)
|
||||
Wayback Machine (revision 5793624)
|
||||
Adam Bohorič (revision 5597667)
|
||||
Parma (Italija) (revision 5784213)
|
||||
Podtalnica (revision 5806060)
|
||||
Tadej Pogačar (revision 5852382)
|
||||
Stanko Bloudek (revision 5812856)
|
||||
Novo mesto (revision 5832728)
|
||||
Brod (revision 5797632)
|
||||
Seznam parkov v Ljubljani (revision 5841252)
|
||||
Bled (revision 5849206)
|
||||
Gimnastika na Poletnih olimpijskih igrah 1964 (revision 5773092)
|
||||
Slovenska glasba (revision 5838083)
|
||||
Internet Archive (revision 5350221)
|
||||
Gramofon (revision 5254280)
|
||||
Nemčija (revision 5851761)
|
||||
Zdenka Cerar (revision 5468687)
|
||||
Halštatska kultura (revision 5764080)
|
||||
Park (revision 5838021)
|
||||
Registrske tablice Bosne in Hercegovine (revision 5786827)
|
||||
Egipt (revision 5799694)
|
||||
Taborsko gibanje (revision 5211889)
|
||||
Zgodovina Slovenije od leta 1918 do leta 1945 (revision 5833911)
|
||||
35. šahovska olimpijada (revision 5100544)
|
||||
Šmartno pod Šmarno goro (revision 5560049)
|
||||
Kraljevina Jugoslavija (revision 5722769)
|
||||
Vižmarje (revision 5823732)
|
||||
Jesenice (revision 5801196)
|
||||
Nogometna zveza Slovenije (revision 5581253)
|
||||
München (revision 5848842)
|
||||
Gosposka ulica, Maribor (revision 4716831)
|
||||
Fedja Žbona (revision 5566289)
|
||||
Vipavski Križ (revision 5674644)
|
||||
Jezikoslovec (revision 5630564)
|
||||
Kisik (revision 5828458)
|
||||
Franc Jelovšek (revision 5478988)
|
||||
Šmarna gora (revision 5682956)
|
||||
12. februar (revision 5719793)
|
||||
Miroslav Cerar ml. (revision 5781280)
|
||||
Bradlja (revision 5797424)
|
||||
Jamova cesta, Ljubljana (revision 4939419)
|
||||
Italijani (revision 5693543)
|
||||
Goran Dragić (revision 5818154)
|
||||
Radio Gama MM (revision 5628380)
|
||||
Strežnik (revision 5169883)
|
||||
Nizozemska (revision 5834147)
|
||||
Nogometaš (revision 5813327)
|
||||
Štajerska (vojvodina) (revision 5567827)
|
||||
Peter Prevc (revision 5819270)
|
||||
Muzej Nadškofije Maribor (revision 5781830)
|
||||
Seznam cerkva v Sloveniji (revision 5788232)
|
||||
Sodni stolp, Maribor (revision 5275646)
|
||||
OSP (identifikator) (revision 5745096)
|
||||
Trove (identifikator) (revision 5791646)
|
||||
Občina Bohinj (revision 5849568)
|
||||
Rožna dolina, Ljubljana (revision 5831882)
|
||||
Oklepno bojno vozilo (revision 5845053)
|
||||
Regnum Carantanum (revision 4871988)
|
||||
Dravlje (revision 5852466)
|
||||
Rusija (revision 5844035)
|
||||
Elektro Maribor (revision 5771585)
|
||||
Ljutomer (revision 5804076)
|
||||
Policija (Slovenija) (revision 5759961)
|
||||
Bohoričica (revision 5810405)
|
||||
Temperatura (revision 5846949)
|
||||
Aljaž Pegan (revision 4791439)
|
||||
Seznam slovenskih filmov (revision 5850621)
|
||||
Python (programski jezik) (revision 5852283)
|
||||
Skladnja (revision 5847947)
|
||||
7. november (revision 5826321)
|
||||
SBI (identifikator) (revision 5744738)
|
||||
Slovenci (revision 5814723)
|
||||
Magdalena, Maribor (revision 5491961)
|
||||
Arturo Toscanini (revision 4746928)
|
||||
Mestna četrt Studenci (revision 5748733)
|
||||
Osvobodilna fronta (revision 5783709)
|
||||
Istra (revision 5801995)
|
||||
Gašper Porenta (revision 5615365)
|
||||
Elektronska glasba (revision 5688546)
|
||||
Sežana (revision 5851264)
|
||||
Avtorske pravice (revision 5468897)
|
||||
Kanada (revision 5835910)
|
||||
Maksim Gaspari (revision 5594007)
|
||||
Latinščina (revision 5366357)
|
||||
Cenzura (revision 5798182)
|
||||
Ljubljansko barje (revision 5778992)
|
||||
Dirigent (revision 5750263)
|
||||
Andrej Hauptman (revision 5657117)
|
||||
SUDOC (identifikator) (revision 5038792)
|
||||
Indija (revision 5802236)
|
||||
Jurij Dalmatin (revision 5820012)
|
||||
Hokej na ledu (revision 5832694)
|
||||
COBISS (revision 5574898)
|
||||
International Standard Book Number (revision 5780406)
|
||||
Dajnčica (revision 5463766)
|
||||
Primož Roglič (revision 5846873)
|
||||
Favna (revision 5832944)
|
||||
UEFA (revision 5809188)
|
||||
Goriška statistična regija (revision 5773409)
|
||||
Zwolle (revision 5845012)
|
||||
Mestna avtobusna linija št. 18 (Ljubljana) (revision 5778375)
|
||||
Slovenska ulica, Maribor (revision 5577876)
|
||||
Maistrov park, Ljubljana (revision 5581380)
|
||||
Olimpijsko dviganje uteži (revision 5763765)
|
||||
Seznam hrvaških dirigentov (revision 5804710)
|
||||
Janez Slapar (revision 5663199)
|
||||
Kraljevina Srbov, Hrvatov in Slovencev (revision 5625127)
|
||||
Koper (revision 5849793)
|
||||
6. maj (revision 5696428)
|
||||
Daniel Barenboim (revision 5628908)
|
||||
Soška fronta (revision 5810750)
|
||||
Tone Kralj (revision 5623334)
|
||||
ISNI (identifikator) (revision 5503117)
|
||||
1996 (revision 5600186)
|
||||
Občina Sveti Andraž v Slovenskih goricah (revision 5489980)
|
||||
Avstro-Ogrska (revision 5852400)
|
||||
29. oktober (revision 5451155)
|
||||
Helmut Recknagel (revision 4756293)
|
||||
Transsibirska železnica (revision 5815205)
|
||||
1876 (revision 5571931)
|
||||
Cirilica (revision 5846804)
|
||||
Gorica (revision 5801255)
|
||||
Stuttgart (revision 5689831)
|
||||
Praga (revision 5826163)
|
||||
Goražde (revision 5668077)
|
||||
Panonija (revision 5692925)
|
||||
Istroromuni (revision 5019989)
|
||||
Internet (revision 5843085)
|
||||
Rona (revision 5814383)
|
||||
Ferdinand de Saussure (revision 4867879)
|
||||
Motovun (revision 5684425)
|
||||
Matjaž Debelak (revision 5086562)
|
||||
Leopold Mandić (revision 5812676)
|
||||
Župnija Šmartno pod Šmarno goro (revision 5795515)
|
||||
Mungo Park (revision 5781803)
|
||||
Ignacij Žitnik (revision 5032848)
|
||||
Nacionalsocialistična nemška delavska stranka (revision 5747610)
|
||||
Henrik II. Sveti (revision 5811384)
|
||||
Tadej Sakelšek (revision 4799036)
|
||||
Vila Podrožnik (revision 5769727)
|
||||
Napoleon Bonaparte (revision 5829140)
|
||||
Irska (država) (revision 5830934)
|
||||
|
||||
== End of Parsed pages ==
|
||||
|
||||
- Wikipedia parsing ended at: 2022-12-15 00:26:24.800748
|
||||
|
||||
54 characters appeared 1189504 times.
|
||||
|
||||
Most Frequent characters:
|
||||
[ 0] Char o: 23.225983266975142 %
|
||||
[ 1] Char a: 8.511783062520177 %
|
||||
[ 2] Char e: 8.489000457333477 %
|
||||
[ 3] Char i: 7.719267862907565 %
|
||||
[ 4] Char n: 5.9588702518024315 %
|
||||
[ 5] Char r: 4.717176234800387 %
|
||||
[ 6] Char s: 4.307593753362746 %
|
||||
[ 7] Char l: 4.070772355536426 %
|
||||
[ 8] Char t: 3.8999448509630907 %
|
||||
[ 9] Char v: 3.6506812923706016 %
|
||||
[10] Char j: 3.5874616646938557 %
|
||||
[11] Char k: 3.206210319595394 %
|
||||
[12] Char d: 2.681117507801571 %
|
||||
[13] Char p: 2.552828742063919 %
|
||||
[14] Char m: 2.4061289411384914 %
|
||||
[15] Char u: 1.8266437103195956 %
|
||||
[16] Char z: 1.6809527332400733 %
|
||||
[17] Char b: 1.5614911761540946 %
|
||||
[18] Char g: 1.3392977240934036 %
|
||||
[19] Char h: 0.9866297212955989 %
|
||||
[20] Char č: 0.9708248143764123 %
|
||||
[21] Char c: 0.9144988163133541 %
|
||||
[22] Char š: 0.7542639621220273 %
|
||||
[23] Char ž: 0.46204132142472826 %
|
||||
[24] Char f: 0.31223098030775853 %
|
||||
|
||||
The first 25 characters have an accumulated ratio of 0.9979369552351233.
|
||||
The first 3 characters have an accumulated ratio of 0.402267667868288.
|
||||
All characters whose order is over 19 have an accumulated ratio of 0.03413859894544281.
|
||||
|
||||
892 sequences found.
|
||||
|
||||
First 425 (typical positive ratio): 0.9950503733060554
|
||||
Next 141 (566-425): 0.003955359577353934
|
||||
Rest: 0.0009942671165906747
|
||||
|
||||
- Processing end: 2022-12-15 00:26:25.414029
|
||||
251
script/BuildLangModelLogs/LangSpanishModel.log
Normal file
251
script/BuildLangModelLogs/LangSpanishModel.log
Normal file
@ -0,0 +1,251 @@
|
||||
= Logs of language model for Spanish (es) =
|
||||
|
||||
- Generated by BuildLangModel.py
|
||||
- Started: 2022-12-15 00:16:57.630923
|
||||
- Maximum depth: 4
|
||||
- Max number of pages: 200
|
||||
|
||||
== Parsed pages ==
|
||||
|
||||
España (revision 147846877)
|
||||
Puerto de Valencia (revision 147263018)
|
||||
Batalla de las Navas de Tolosa (revision 147899245)
|
||||
Gayo Valerio Catulo (revision 147034414)
|
||||
Rumanía (revision 147840376)
|
||||
Restauración borbónica en España (revision 147242761)
|
||||
Amadeo I de España (revision 147895776)
|
||||
Carretera (revision 146497439)
|
||||
Hispania romana (revision 147140924)
|
||||
Condado de Aragón (revision 147705941)
|
||||
Aprisio (revision 146343212)
|
||||
Expulsión de los moriscos (revision 147322617)
|
||||
Trajano (revision 147221352)
|
||||
Firme (revision 133295121)
|
||||
García Sánchez III de Navarra (revision 145340082)
|
||||
Galera (revision 145041703)
|
||||
Primera ocupación estadounidense de Cuba (revision 147668067)
|
||||
Ducado de Aosta (revision 142206759)
|
||||
Argentina (revision 147787947)
|
||||
Fermín Vázquez Huarte-Mendicoa (revision 143197376)
|
||||
Copa América (regata) (revision 145564517)
|
||||
Orfanato (revision 147534889)
|
||||
Sarmizegetusa (revision 144870783)
|
||||
Consulado de Mar (revision 146225617)
|
||||
Fernando II de Aragón (revision 147893455)
|
||||
Biblioteca del Estado Ruso (revision 146701651)
|
||||
Legua (revision 147859149)
|
||||
Analfabetismo (revision 147277991)
|
||||
Castilla-La Mancha (revision 147878187)
|
||||
Arabia Pétrea (revision 144133525)
|
||||
Fichero de Autoridades Virtual Internacional (revision 147484179)
|
||||
Navarra (revision 147823860)
|
||||
Figueras (revision 147279059)
|
||||
Alférez del reino de Aragón (revision 124825088)
|
||||
Epigrama (revision 146729622)
|
||||
Domínio Público (revision 140507910)
|
||||
Siglo VIII d. C. (revision 147188280)
|
||||
Bartolomé Barrientos (revision 145038377)
|
||||
Liga I (revision 144669044)
|
||||
Clodia (revision 145561796)
|
||||
Carreteras de Paraguay (revision 146992656)
|
||||
Sancho I de Cerdaña (revision 138982129)
|
||||
Economía de mercado (revision 147313599)
|
||||
José María Celleruelo (revision 145269367)
|
||||
Tomás II de Saboya (revision 142102478)
|
||||
Derecho romano (revision 147657302)
|
||||
Biblioteca Nacional de España (revision 147891240)
|
||||
Mineápolis (revision 147704011)
|
||||
ISBN (revision 147868478)
|
||||
Control de autoridades (revision 147901716)
|
||||
Mateo Alemán (revision 146145355)
|
||||
Ion Iliescu (revision 147019188)
|
||||
Realengo (revision 146773864)
|
||||
Cardenal Richelieu (revision 146771804)
|
||||
Imperio romano (revision 147904549)
|
||||
Legión romana (revision 145800436)
|
||||
Valle de Hecho (revision 147907860)
|
||||
Reinos cristianos peninsulares medievales (revision 147343815)
|
||||
Basílica de Superga (revision 145927850)
|
||||
Valle del Aragón (revision 119483947)
|
||||
Monasterio de San Adrián de Sásave (revision 147892813)
|
||||
Carreteras de Costa Rica (revision 143524768)
|
||||
1885 (revision 146584859)
|
||||
Alfonso IX de León (revision 147627974)
|
||||
Bernardo IV de Cominges (revision 114887808)
|
||||
Cortes Generales (revision 147873038)
|
||||
Mureș (distrito) (revision 134462528)
|
||||
María Luisa de Borbón (revision 147020083)
|
||||
Batalla de Pancorbo (revision 126434305)
|
||||
Marco Asinio Marcelo el Joven (revision 145460206)
|
||||
Danubio (revision 147663654)
|
||||
Japón (revision 147908194)
|
||||
Proyecto Perseus (revision 143471495)
|
||||
1883 (revision 147067746)
|
||||
Dacia (revision 145103904)
|
||||
Vino (revision 147117191)
|
||||
Bitinia (revision 145480241)
|
||||
Turdetania (revision 146975909)
|
||||
Camil Petrescu (revision 140060244)
|
||||
Navarroaragonés (revision 147720554)
|
||||
Soldados de cuera en la Nueva España (revision 130807692)
|
||||
Sistema de coordenadas (revision 146022910)
|
||||
Traian Vuia (revision 104701860)
|
||||
Ganadería (revision 147821896)
|
||||
Marcas viales (revision 146856509)
|
||||
Wayback Machine (revision 147380387)
|
||||
Taifa (revision 147654597)
|
||||
Península ibérica (revision 147829469)
|
||||
Cuadernos de Historia de España (revision 139000336)
|
||||
Senado de España (revision 147197812)
|
||||
Embarque del rey Amadeo en el puerto de La Spezia, Italia (revision 117322558)
|
||||
Corona de Aragón (revision 147918234)
|
||||
Arabia Petraea (revision 144133525)
|
||||
Lírica (revision 146917867)
|
||||
Reconquista (revision 147927548)
|
||||
Gaspar Ibáñez de Segovia (revision 130716508)
|
||||
Jacinto Higueras Fuentes (revision 139784283)
|
||||
Roma (revision 147103514)
|
||||
El coloquio de los perros (revision 143068547)
|
||||
Julio César (revision 147630019)
|
||||
Europa (revision 147625494)
|
||||
Cabezo de Alcalá (revision 147755100)
|
||||
Batalla de Guadalacete (revision 146354062)
|
||||
Istituto Centrale per il Catalogo Unico (revision 115809446)
|
||||
Italia (época romana) (revision 147396271)
|
||||
Inquisidor general (revision 144876852)
|
||||
Eneida (revision 147423295)
|
||||
Biblioteca Nacional de la República Checa (revision 139921933)
|
||||
Batalla de Los Alporchones (revision 124421073)
|
||||
Kilómetro cuadrado (revision 147523192)
|
||||
Frente de Salvación Nacional (Rumanía) (revision 125710058)
|
||||
Bética (revision 147107544)
|
||||
Autoridad Portuaria (revision 123820617)
|
||||
19 a. C. (revision 129514951)
|
||||
República de Salé (revision 147293137)
|
||||
Regencia de María Cristina de Habsburgo (revision 144125085)
|
||||
Siderurgia (revision 147648595)
|
||||
Calle del Arenal (revision 135013775)
|
||||
Dacia (provincia romana) (revision 145375850)
|
||||
Brăila (distrito) (revision 118960625)
|
||||
Eulogio de Córdoba (revision 147461878)
|
||||
Carlos V (1898) (revision 147728618)
|
||||
David Chipperfield (revision 147744851)
|
||||
Reino de Piamonte-Cerdeña (revision 147794353)
|
||||
Pentecostés (revision 146565213)
|
||||
Autopista de peaje (revision 146064321)
|
||||
Juan Manuel Rodríguez Tobal (revision 147929976)
|
||||
Lisboa (revision 146533001)
|
||||
Tierra de Campos (revision 147470227)
|
||||
Fernando III de Toscana (revision 146980042)
|
||||
Susa (revision 147294207)
|
||||
Julio Caro Baroja (revision 147721448)
|
||||
Guadiana (revision 147757644)
|
||||
Modernismo catalán (revision 146462730)
|
||||
Juan García Margallo (revision 142477532)
|
||||
Transporte fluvial (revision 147205941)
|
||||
Latín (revision 147910997)
|
||||
Rin (revision 147465318)
|
||||
Pompeyo (revision 147631801)
|
||||
Ur (revision 147802269)
|
||||
Hexámetro (revision 146332360)
|
||||
Carl Orff (revision 147005923)
|
||||
Tratado germano-español (1899) (revision 146016292)
|
||||
Asturias (revision 147837987)
|
||||
Autoridad Portuaria de Almería (revision 145172909)
|
||||
Jaca (revision 147907824)
|
||||
Catecismo (revision 140585625)
|
||||
Idioma español (revision 147930055)
|
||||
Catulo (revision 147034414)
|
||||
Évariste Lévi-Provençal (revision 139972364)
|
||||
Prefectura de Ōita (revision 145817808)
|
||||
Bosra (revision 137685821)
|
||||
Carabela (revision 146833375)
|
||||
Vikingos (revision 147579555)
|
||||
Real Academia de la Lengua Vasca (revision 147652877)
|
||||
Brother Ali (revision 145114820)
|
||||
Reino de León (revision 147927670)
|
||||
Día de la Ascensión (revision 143846680)
|
||||
Asedio de Sarmizegetusa (106) (revision 133263630)
|
||||
Manuel Carrión Gútiez (revision 147904740)
|
||||
Tomás Estrada Palma (revision 147403804)
|
||||
Condado de Provenza (revision 140629566)
|
||||
Bután (revision 147766188)
|
||||
Lugal (revision 134382903)
|
||||
Taifa de Baeza (revision 146178470)
|
||||
Ostia (revision 147808627)
|
||||
Prueba (derecho) (revision 147057802)
|
||||
Milán (revision 147681342)
|
||||
Antigua Roma (revision 147904658)
|
||||
Alejandro Magno (revision 147694927)
|
||||
Deutsche Biographie (revision 145290662)
|
||||
Arauca (Colombia) (revision 147637823)
|
||||
Jerónimo Castillón y Salas (revision 146457531)
|
||||
Dinero fiduciario (revision 146893236)
|
||||
Enciclopedia Británica (revision 147744092)
|
||||
Gemeinsame Normdatei (revision 146776905)
|
||||
Años 10 (revision 136223252)
|
||||
Biblioteca Pública de Minneapolis (revision 138446442)
|
||||
Túrdulos (revision 135552964)
|
||||
Mariano Benlliure (revision 145839577)
|
||||
Conde de Tolosa (revision 147283581)
|
||||
Casa de Luna (revision 145401384)
|
||||
Primate (revision 146080213)
|
||||
2003 (revision 147862656)
|
||||
Montes de Toledo (revision 146840694)
|
||||
Aísa (revision 147075147)
|
||||
Sibiu (distrito) (revision 146095710)
|
||||
Estrabón (revision 146155215)
|
||||
Región (revision 147836172)
|
||||
|
||||
== End of Parsed pages ==
|
||||
|
||||
- Wikipedia parsing ended at: 2022-12-15 00:20:04.575097
|
||||
|
||||
60 characters appeared 2948807 times.
|
||||
|
||||
Most Frequent characters:
|
||||
[ 0] Char e: 12.481250892310008 %
|
||||
[ 1] Char a: 12.094619959868517 %
|
||||
[ 2] Char o: 8.30084844481175 %
|
||||
[ 3] Char n: 7.189382011098047 %
|
||||
[ 4] Char s: 7.165033181215319 %
|
||||
[ 5] Char i: 6.924766524224882 %
|
||||
[ 6] Char r: 6.661473606105791 %
|
||||
[ 7] Char l: 5.981537618433489 %
|
||||
[ 8] Char d: 5.191082359747519 %
|
||||
[ 9] Char c: 4.555808501539775 %
|
||||
[10] Char t: 4.496428555683705 %
|
||||
[11] Char u: 3.6601242468564408 %
|
||||
[12] Char m: 2.7249664016668436 %
|
||||
[13] Char p: 2.5307861789530475 %
|
||||
[14] Char b: 1.3648909542062264 %
|
||||
[15] Char g: 1.289301063107894 %
|
||||
[16] Char v: 0.9662890789393813 %
|
||||
[17] Char y: 0.8741162103860984 %
|
||||
[18] Char f: 0.8234855655185301 %
|
||||
[19] Char ó: 0.8032400899753698 %
|
||||
[20] Char h: 0.6518229236433581 %
|
||||
[21] Char í: 0.5769451849510667 %
|
||||
[22] Char q: 0.56327864115895 %
|
||||
[23] Char á: 0.3847318593587169 %
|
||||
[24] Char j: 0.3699123069092009 %
|
||||
[25] Char z: 0.35848395639321257 %
|
||||
[26] Char é: 0.29174510234138756 %
|
||||
[27] Char x: 0.2329755728333526 %
|
||||
[28] Char ñ: 0.20214954725758585 %
|
||||
[29] Char ú: 0.12805178501000575 %
|
||||
[30] Char k: 0.09152854018591247 %
|
||||
[31] Char w: 0.04032139098964429 %
|
||||
[32] Char ü: 0.011157054361306115 %
|
||||
|
||||
The first 33 characters have an accumulated ratio of 0.9998253531004235.
|
||||
The first 4 characters have an accumulated ratio of 0.4006610130808832.
|
||||
All characters whose order is over 20 have an accumulated ratio of 0.03251280941750342.
|
||||
|
||||
1139 sequences found.
|
||||
|
||||
First 446 (typical positive ratio): 0.9950161077973593
|
||||
Next 174 (620-446): 0.0039865210967695575
|
||||
Rest: 0.0009973711058711698
|
||||
|
||||
- Processing end: 2022-12-15 00:20:04.696283
|
||||
253
script/BuildLangModelLogs/LangSwedishModel.log
Normal file
253
script/BuildLangModelLogs/LangSwedishModel.log
Normal file
@ -0,0 +1,253 @@
|
||||
= Logs of language model for Swedish (sv) =
|
||||
|
||||
- Generated by BuildLangModel.py
|
||||
- Started: 2022-12-15 00:17:21.940825
|
||||
- Maximum depth: 4
|
||||
- Max number of pages: 200
|
||||
|
||||
== Parsed pages ==
|
||||
|
||||
Kakapo (revision 49828871)
|
||||
Integrated Taxonomic Information System (revision 48591706)
|
||||
Ryggradsdjur (revision 51433096)
|
||||
Frukter (revision 50447960)
|
||||
Jordbruk (revision 50783223)
|
||||
Hjortdjur (revision 51593592)
|
||||
Frö (revision 50122699)
|
||||
George Robert Gray (revision 51407311)
|
||||
Näbb (revision 50932877)
|
||||
Pollen (revision 49316911)
|
||||
Splintved (revision 24774027)
|
||||
Evolution (revision 51556245)
|
||||
Trabekler (revision 40627327)
|
||||
Molekylär klocka (revision 47887818)
|
||||
Gemeinsame Normdatei (revision 46103091)
|
||||
Markavvattning (revision 45999859)
|
||||
Tjeckiska nationalbiblioteket (revision 46514905)
|
||||
Engelska (revision 51315874)
|
||||
Skog (revision 51604140)
|
||||
Erosion (revision 50172896)
|
||||
Biodynamiskt jordbruk (revision 51110008)
|
||||
Naturkatastrof (revision 51624603)
|
||||
Offentlig handling (revision 50850574)
|
||||
Acanthodii (revision 50085495)
|
||||
Gametofyt (revision 49172013)
|
||||
Nordamerika (revision 51280439)
|
||||
Ben (skelett) (revision 47443011)
|
||||
Jordbruksredskap (revision 49419310)
|
||||
Paleognata fåglar (revision 49265534)
|
||||
Bomull (revision 51005901)
|
||||
Sydamerika (revision 51602843)
|
||||
Tvåsäde (revision 49563111)
|
||||
Kaka (revision 50808589)
|
||||
Ornitolog (revision 50575500)
|
||||
Cellulär mikrobiologi (revision 40081943)
|
||||
Däggdjur (revision 51188513)
|
||||
Vitläppad hjort (revision 51520475)
|
||||
Vall (jordbruk) (revision 49990402)
|
||||
WoRMS (revision 47891148)
|
||||
Allergen (revision 44738256)
|
||||
Träsk (revision 51507101)
|
||||
Catalogue of Life (revision 51324126)
|
||||
Korspollinering (revision 49203596)
|
||||
Åsna (revision 51145621)
|
||||
Opium (revision 49391458)
|
||||
Pastoralism (revision 46456106)
|
||||
Polygyni (revision 49883288)
|
||||
Entomologi (revision 51271587)
|
||||
Portugisiska (revision 51529880)
|
||||
National Library of Australia (revision 48833796)
|
||||
Environmental Protection Agency (revision 50404069)
|
||||
Sediment (geologi) (revision 51050878)
|
||||
Australien (revision 51642934)
|
||||
Federalism (revision 51220233)
|
||||
Sköldpaddor (revision 50621871)
|
||||
Sankt Hubertus (revision 33100409)
|
||||
Selektionstryck (revision 39929085)
|
||||
Eurasien (revision 50423610)
|
||||
Trofim Lysenko (revision 50325565)
|
||||
International Standard Name Identifier (revision 48798571)
|
||||
Domesticerad (revision 50257550)
|
||||
Biofysik (revision 51551479)
|
||||
Överhud (revision 49716509)
|
||||
Ekosystem (revision 51621713)
|
||||
Spaniens nationalbibliotek (revision 50524680)
|
||||
Självpollinering (revision 50794518)
|
||||
6 maj (revision 50524196)
|
||||
Blåsfiskar (revision 50222146)
|
||||
Broskfiskar (revision 50455230)
|
||||
Royal Society (revision 50241446)
|
||||
Barrträd (revision 50042026)
|
||||
Rovfåglar (revision 46026584)
|
||||
Större korsnäbb (revision 51530409)
|
||||
Datering (revision 50458901)
|
||||
Yoghurt (revision 51184993)
|
||||
Genflöde (revision 49493634)
|
||||
Auktoritetsdata (revision 50073323)
|
||||
Starar (revision 51460916)
|
||||
Library of Congress Control Number (revision 46514899)
|
||||
Starkt hotad (revision 51148334)
|
||||
Calamianhjort (revision 50657405)
|
||||
Papegojfåglar (revision 50180477)
|
||||
Källmaterial (revision 49256255)
|
||||
Davidshjort (revision 49672175)
|
||||
Palynologi (revision 49558278)
|
||||
Julian Huxley (revision 50501056)
|
||||
Den själviska genen (revision 49412209)
|
||||
Gödsel (revision 49711703)
|
||||
Ståndare (revision 46462001)
|
||||
Biodiversitetsinformatik (revision 47810882)
|
||||
Transhumans (revision 50170038)
|
||||
Blomma (revision 51638463)
|
||||
Frukt (revision 50447960)
|
||||
Skogsjordbruk (revision 49347881)
|
||||
Auktorsnamn (revision 51253351)
|
||||
Jordbruksnäring (revision 47416550)
|
||||
Juvenil (revision 51610324)
|
||||
Georges Louis Leclerc de Buffon (revision 51161247)
|
||||
Miljöförstöring (revision 50620440)
|
||||
Bin (revision 49898091)
|
||||
Ornitologi (revision 50575500)
|
||||
Fruktkorg (revision 49985978)
|
||||
Chelsea, London (revision 45391421)
|
||||
Citrusmullbär (revision 47780335)
|
||||
Labbar (revision 47662463)
|
||||
Fåglar (revision 51631929)
|
||||
Louis Agassiz (revision 51312350)
|
||||
Solbad (revision 49714623)
|
||||
Blomväxt (revision 49082614)
|
||||
Boskapsskötsel (revision 51553022)
|
||||
Groddjur (revision 51562615)
|
||||
Ryggsträng (revision 47677207)
|
||||
Art (revision 51144100)
|
||||
Slaga (revision 51096096)
|
||||
711 (revision 46456309)
|
||||
Krokodilfarm (revision 20464285)
|
||||
Hudcancer (revision 51602648)
|
||||
Skära (revision 50510590)
|
||||
Carl von Linné (revision 51625705)
|
||||
Jasper Becker (revision 51366138)
|
||||
Tony Blair (revision 51547588)
|
||||
Färöarnas nationalbibliotek (revision 42918932)
|
||||
Malaria (revision 51411903)
|
||||
Nationalbiblioteket (revision 50524399)
|
||||
Insekter (revision 51005363)
|
||||
Ao Nang (revision 51097197)
|
||||
Pälsdjursuppfödning (revision 50258398)
|
||||
Lista över auktorer inom fågeltaxonomin (revision 38750042)
|
||||
Tjocknäbbad stare (revision 50428929)
|
||||
Mexiko (revision 51606982)
|
||||
Melanin (revision 51324379)
|
||||
1898 (revision 51111140)
|
||||
Fylum (revision 48212330)
|
||||
Rike (biologi) (revision 50937218)
|
||||
Global Biodiversity Information Facility (revision 51445943)
|
||||
Kvantbiologi (revision 50614621)
|
||||
Växeljordbruk (revision 49057044)
|
||||
Storhuvudsköldpaddor (revision 50104788)
|
||||
Särvsläktet (revision 49277312)
|
||||
John Ray (revision 51315235)
|
||||
Käkben (revision 49958717)
|
||||
1956 (revision 51150741)
|
||||
Barbados (revision 51615572)
|
||||
Organism (revision 51537725)
|
||||
SKUD (revision 49136322)
|
||||
Jordbävningen i Sichuan 2008 (revision 49297824)
|
||||
Slåtterblomma (revision 51099474)
|
||||
Tyska (revision 51610484)
|
||||
Devon (period) (revision 48663164)
|
||||
Klass (biologi) (revision 44944834)
|
||||
Russian State Library (revision 50564132)
|
||||
Terrass (jordbruk) (revision 39460865)
|
||||
Manater (revision 49271359)
|
||||
Anders Jahan Retzius (revision 49982478)
|
||||
Olivträd (revision 49440211)
|
||||
Orectolobiformes (revision 51584388)
|
||||
Kunskapsbrist (revision 51149826)
|
||||
Chinchilla (revision 51623767)
|
||||
Värmland (revision 51562118)
|
||||
Biblioteca Nacional de Portugal (revision 42919564)
|
||||
Frankrike (revision 51612697)
|
||||
Stratigrafisk utbredning (revision 48370412)
|
||||
Öken (revision 50057233)
|
||||
Fossilworks (revision 51334868)
|
||||
Internet Archive (revision 51051535)
|
||||
Självreferens (revision 51522468)
|
||||
Smålånke (revision 48621114)
|
||||
Dator (revision 51220516)
|
||||
Ekologi (revision 51390308)
|
||||
Pastor (revision 47140943)
|
||||
Kalender (revision 49440631)
|
||||
Kommunalt bolag (revision 50622319)
|
||||
Biodling (revision 49199502)
|
||||
Nasjonalbiblioteket (revision 50524727)
|
||||
Genetik (revision 51210350)
|
||||
Makt (revision 51641654)
|
||||
Biotop (revision 51475637)
|
||||
Skandinavien (revision 51214754)
|
||||
Japan (revision 51615621)
|
||||
Cellbiologi (revision 49256694)
|
||||
Jak (revision 51050149)
|
||||
Ämnesord (revision 48272100)
|
||||
Rod Laver (revision 49666673)
|
||||
Narkotin (revision 43983774)
|
||||
Åkerbruk (revision 50783223)
|
||||
Encyclopedia of Life (revision 49580226)
|
||||
Charles Dudley Warner (revision 50246517)
|
||||
Bekämpningsmedel (revision 49196587)
|
||||
Plommon (revision 49642585)
|
||||
São Paulo (revision 51580808)
|
||||
Översättning (revision 51189617)
|
||||
Stjärtlösa groddjur (revision 51636774)
|
||||
Tunnel (revision 51436951)
|
||||
Biel (revision 51097266)
|
||||
|
||||
== End of Parsed pages ==
|
||||
|
||||
- Wikipedia parsing ended at: 2022-12-15 00:20:31.413070
|
||||
|
||||
66 characters appeared 1023715 times.
|
||||
|
||||
Most Frequent characters:
|
||||
[ 0] Char e: 9.949546504642406 %
|
||||
[ 1] Char a: 9.573856004845098 %
|
||||
[ 2] Char r: 8.879912866373942 %
|
||||
[ 3] Char n: 8.409371748973102 %
|
||||
[ 4] Char t: 7.424625017705123 %
|
||||
[ 5] Char s: 6.303609891424859 %
|
||||
[ 6] Char i: 6.043478897935461 %
|
||||
[ 7] Char l: 5.368877080046692 %
|
||||
[ 8] Char o: 4.814621256892788 %
|
||||
[ 9] Char d: 4.46276551579297 %
|
||||
[10] Char m: 3.3543515529224446 %
|
||||
[11] Char k: 3.3218229683066083 %
|
||||
[12] Char g: 2.833308098445368 %
|
||||
[13] Char v: 2.3090410905378937 %
|
||||
[14] Char u: 2.02556375553743 %
|
||||
[15] Char f: 2.0134510093141156 %
|
||||
[16] Char p: 1.9067806957991238 %
|
||||
[17] Char h: 1.8919328133318356 %
|
||||
[18] Char ä: 1.8456308640588444 %
|
||||
[19] Char c: 1.4756060036240555 %
|
||||
[20] Char b: 1.419242660310731 %
|
||||
[21] Char å: 1.257381204729832 %
|
||||
[22] Char ö: 1.2369653663373106 %
|
||||
[23] Char j: 0.674015717265059 %
|
||||
[24] Char y: 0.6544790298081009 %
|
||||
[25] Char x: 0.23580781760548591 %
|
||||
[26] Char w: 0.12093209535857148 %
|
||||
[27] Char z: 0.06622937047908842 %
|
||||
[28] Char é: 0.05861006237087471 %
|
||||
[29] Char q: 0.01738765183669283 %
|
||||
|
||||
The first 30 characters have an accumulated ratio of 0.999492046126119.
|
||||
The first 5 characters have an accumulated ratio of 0.4423731214253967.
|
||||
All characters whose order is over 21 have an accumulated ratio of 0.03064427111061184.
|
||||
|
||||
1083 sequences found.
|
||||
|
||||
First 502 (typical positive ratio): 0.9950552774288863
|
||||
Next 173 (675-502): 0.003945597397539302
|
||||
Rest: 0.0009991251735743667
|
||||
|
||||
- Processing end: 2022-12-15 00:20:31.562821
|
||||
233
script/BuildLangModelLogs/LangThaiModel.log
Normal file
233
script/BuildLangModelLogs/LangThaiModel.log
Normal file
@ -0,0 +1,233 @@
|
||||
= Logs of language model for Thai (th) =
|
||||
|
||||
- Generated by BuildLangModel.py
|
||||
- Started: 2022-12-15 00:20:40.435765
|
||||
- Maximum depth: 4
|
||||
- Max number of pages: 200
|
||||
|
||||
== Parsed pages ==
|
||||
|
||||
หน้าหลัก (revision 9904032)
|
||||
สาธารณรัฐจีน (revision 10416930)
|
||||
ธงชาติเอสโตเนีย (revision 10179603)
|
||||
ประชากรโลก (revision 10416810)
|
||||
คนญี่ปุ่น (revision 9865034)
|
||||
การอักเสบ (revision 10108520)
|
||||
เจียง เจ๋อหมิน (revision 10459105)
|
||||
เดลี (revision 10005684)
|
||||
ทาลลินน์ (revision 10365090)
|
||||
แมโครไลด์ (revision 10250756)
|
||||
พรรคนาซี (revision 10397981)
|
||||
นาซา (revision 10421933)
|
||||
รายชื่อประเทศเรียงตามความหนาแน่นประชากร (revision 9864334)
|
||||
ไทกีไซคลีน (revision 10469110)
|
||||
ธงชาติจอร์เจีย (revision 10357912)
|
||||
ศรีลังกา (revision 10388909)
|
||||
อุตตรประเทศ (revision 10151167)
|
||||
ค.ศ. 1981 (revision 10216412)
|
||||
ไอล์ออฟแมน (revision 10287539)
|
||||
ผู้หญิง (revision 10358580)
|
||||
พ.ศ. 2488 (revision 10439390)
|
||||
สีดำ (revision 10145506)
|
||||
สตาวังเงอร์ (revision 10098117)
|
||||
เลขาธิการพรรคคอมมิวนิสต์จีน (revision 10311137)
|
||||
บอร์เนน (revision 9371312)
|
||||
ประเทศเยอรมนี (revision 10435832)
|
||||
ธงชาติโมนาโก (revision 10377768)
|
||||
เซี่ยงไฮ้ (revision 10069809)
|
||||
มาเลเซีย (revision 10434631)
|
||||
ฟลอเรนซ์ (revision 10216249)
|
||||
สหราชอาณาจักร (revision 10440822)
|
||||
International Monetary Fund (revision 10182310)
|
||||
ประชากร (revision 9907304)
|
||||
สัทอักษรสากล (revision 10135934)
|
||||
ไนจีเรีย (revision 9842110)
|
||||
เติ้งเสี่ยวผิง (revision 10347197)
|
||||
เรคยาวิก (revision 10218000)
|
||||
ทวีปแอฟริกา (revision 10433059)
|
||||
เยอรมนี (revision 10435832)
|
||||
Government of India (revision 9042357)
|
||||
ดาวเทียมสปุตนิค 1 (revision 10015824)
|
||||
โมนาโก (revision 10373340)
|
||||
แบดมินตัน (revision 10235522)
|
||||
สันนิบาติชาติสังคมนิยมแห่งไรซ์สำหรับการออกกำลังทางกายภาพ (revision 10297912)
|
||||
ค.ศ. 1975 (revision 10127987)
|
||||
Encarta (revision 10246589)
|
||||
มิถุนายน (revision 9444949)
|
||||
อินโดนีเซีย (revision 10435811)
|
||||
Central Intelligence Agency (revision 10246925)
|
||||
บังกลาเทศ (revision 10476965)
|
||||
ธงชาติอาเซอร์ไบจาน (revision 9569706)
|
||||
วิทยาเซรุ่ม (revision 10324917)
|
||||
อียิปต์ (revision 10467051)
|
||||
ธงชาติสาธารณรัฐสังคมนิยมโซเวียตเอสโตเนีย (revision 8471180)
|
||||
อักษรโรมัน (revision 10279083)
|
||||
Literacy in India (revision 10430004)
|
||||
ปารากวัย (revision 10086617)
|
||||
ฝรั่งเศส (revision 10470508)
|
||||
แพฟอส (revision 9443297)
|
||||
รายชื่อประธานาธิบดีสาธารณรัฐประชาชนจีน (revision 9839306)
|
||||
การคว่ำบาตรธุรกิจชาวยิวของนาซี (revision 8140635)
|
||||
โบลิเวีย (revision 10086745)
|
||||
พ.ศ. 2545 (revision 10363737)
|
||||
ทวีปยุโรป (revision 10454257)
|
||||
สิงคโปร์ (revision 10413782)
|
||||
ลักเซมเบิร์ก (revision 10435795)
|
||||
สงครามโลกครั้งที่สอง (revision 10451166)
|
||||
โรคท่อเลือดแดงและหลอดเลือดแดงแข็ง (revision 9907967)
|
||||
หู จิ่นเทา (revision 10428652)
|
||||
ทวีปอเมริกาเหนือ (revision 10418313)
|
||||
อักษรจีนตัวย่อ (revision 9702441)
|
||||
22 กันยายน (revision 10469604)
|
||||
ธงชาติทรานส์นีสเตรีย (revision 9569668)
|
||||
ริเยกา (revision 10297085)
|
||||
ประเทศอินเดีย (revision 10435810)
|
||||
ซินจู๋ (revision 10443351)
|
||||
พยาธิกายวิภาค (revision 5458586)
|
||||
ประเทศแคนาดา (revision 10433247)
|
||||
Coagulative necrosis (revision 7462805)
|
||||
เม็กซิโก (revision 10435830)
|
||||
การต่อต้านยิว (revision 10096394)
|
||||
มหาอำนาจกลาง (revision 10194476)
|
||||
ชาวยิว (revision 10209235)
|
||||
สหรัฐ (revision 10452486)
|
||||
ไทย (revision 10479597)
|
||||
กองทุนประชากรแห่งสหประชาชาติ (revision 10049856)
|
||||
ธงชาติยูเครน (revision 10310962)
|
||||
พินอิน (revision 10344015)
|
||||
ประธานาธิบดีไต้หวัน (revision 10470458)
|
||||
เมกะซิตี (revision 10278263)
|
||||
ออกซาซิลลิน (revision 9349240)
|
||||
เดมะนิม (revision 9117585)
|
||||
นิวแคลิโดเนีย (revision 9854958)
|
||||
สกู๊ตเตอร์ (จักรยานยนต์) (revision 9667079)
|
||||
ISBN (identifier) (revision 10474803)
|
||||
พรรคแอร์โรว์ครอสส์ (revision 9330061)
|
||||
ออร์ฮูส (revision 8961150)
|
||||
ภาษาแต้จิ๋ว (revision 10373510)
|
||||
เวอร์นอน แอล สมิธ (revision 10319937)
|
||||
โกตดาซูร์ (revision 10067696)
|
||||
ประเทศคอสตาริกา (revision 10470553)
|
||||
กอลเวย์ (revision 10347140)
|
||||
ธงชาตินอร์เทิร์นไอร์แลนด์ (revision 9119794)
|
||||
สันนิบาตชาติ (revision 10365681)
|
||||
อักษรตงปา (revision 9353571)
|
||||
โทคิโอโฮเทล (revision 10237521)
|
||||
การตายเฉพาะส่วน (revision 10149625)
|
||||
ประเทศเซเนกัล (revision 9763478)
|
||||
อับฮาเซีย (revision 10393097)
|
||||
สัมประสิทธิ์จีนี (revision 9672933)
|
||||
ญี่ปุ่น (revision 10454334)
|
||||
ชัมมูและกัศมีร์ (ดินแดนสหภาพ) (revision 10236177)
|
||||
4 กุมภาพันธ์ (revision 10267773)
|
||||
ไมโครซอฟท์ (revision 10469284)
|
||||
สีไซอัน (revision 8689665)
|
||||
ซาเกร็บ (revision 10352620)
|
||||
ทะเลเมดิเตอร์เรเนียน (revision 10069175)
|
||||
เกรกอร์ ชตรัสเซอร์ (revision 10167753)
|
||||
อลัน ทัวริง (revision 10436603)
|
||||
เนเธอร์แลนด์ (revision 10356667)
|
||||
ธงชาติไอซ์แลนด์ (revision 9569769)
|
||||
ธงชาติสวีเดน (revision 9029892)
|
||||
ลิแวนต์ (revision 8766864)
|
||||
มุมไบ (revision 10460080)
|
||||
สหภาพแอฟริกา (revision 10037662)
|
||||
สิงหาคม (revision 10267979)
|
||||
กระทรวงเศรษฐกิจและพลังงานสหพันธ์ (revision 9711813)
|
||||
นิติเวชคลินิก (revision 4249372)
|
||||
อันชลุสส์ (revision 9966952)
|
||||
อักษรซาบาเอียน (revision 1799108)
|
||||
สารานุกรม (revision 10359896)
|
||||
จอร์เจีย (revision 10466198)
|
||||
ประเทศเบลีซ (revision 10356670)
|
||||
ธงชาติญี่ปุ่น (revision 10123028)
|
||||
แรน (revision 9688385)
|
||||
|
||||
== End of Parsed pages ==
|
||||
|
||||
- Wikipedia parsing ended at: 2022-12-15 00:23:34.344450
|
||||
|
||||
69 characters appeared 1063822 times.
|
||||
|
||||
Most Frequent characters:
|
||||
[ 0] Char า: 7.033695486650962 %
|
||||
[ 1] Char น: 6.100926658783142 %
|
||||
[ 2] Char ร: 5.938117467019858 %
|
||||
[ 3] Char ก: 4.492104882207737 %
|
||||
[ 4] Char อ: 4.258419171628336 %
|
||||
[ 5] Char เ: 3.990141207833641 %
|
||||
[ 6] Char ง: 3.8734863539201108 %
|
||||
[ 7] Char ่: 3.8162399348763234 %
|
||||
[ 8] Char ั: 3.4642073579978603 %
|
||||
[ 9] Char ม: 3.4400491811600062 %
|
||||
[10] Char ี: 2.9371454999050592 %
|
||||
[11] Char ย: 2.802630515255372 %
|
||||
[12] Char ล: 2.7274299647873423 %
|
||||
[13] Char ้: 2.4708080863151918 %
|
||||
[14] Char ิ: 2.3827294415795124 %
|
||||
[15] Char ว: 2.3490771952450693 %
|
||||
[16] Char ท: 2.2903267651919212 %
|
||||
[17] Char ส: 2.267390597299172 %
|
||||
[18] Char ต: 2.224150280780055 %
|
||||
[19] Char ป: 2.1304315947592736 %
|
||||
[20] Char ด: 2.1304315947592736 %
|
||||
[21] Char ะ: 2.074783187412932 %
|
||||
[22] Char ค: 1.6853383366766246 %
|
||||
[23] Char แ: 1.6731182472255697 %
|
||||
[24] Char บ: 1.6530961006634568 %
|
||||
[25] Char ห: 1.6071297641898736 %
|
||||
[26] Char จ: 1.1870406891378444 %
|
||||
[27] Char ใ: 1.1792386320267865 %
|
||||
[28] Char ช: 1.1505684221608503 %
|
||||
[29] Char ข: 1.0856139466940897 %
|
||||
[30] Char พ: 1.0046793542528731 %
|
||||
[31] Char ์: 0.9819311877362942 %
|
||||
[32] Char ุ: 0.9342728388771806 %
|
||||
[33] Char ื: 0.9291028010325035 %
|
||||
[34] Char ศ: 0.9121826771771969 %
|
||||
[35] Char โ: 0.8589782877210661 %
|
||||
[36] Char ไ: 0.8107559347334422 %
|
||||
[37] Char ็: 0.782367726931761 %
|
||||
[38] Char ู: 0.7527575101849746 %
|
||||
[39] Char ำ: 0.5811122537416974 %
|
||||
[40] Char ซ: 0.5105177369898348 %
|
||||
[41] Char ึ: 0.5069457108426034 %
|
||||
[42] Char ธ: 0.45674934340519374 %
|
||||
[43] Char ษ: 0.44744327528477507 %
|
||||
[44] Char ภ: 0.44152123193541776 %
|
||||
[45] Char ถ: 0.38446281426780043 %
|
||||
[46] Char ญ: 0.37243072619291573 %
|
||||
[47] Char ณ: 0.36209065050356165 %
|
||||
[48] Char ฐ: 0.35842462366824523 %
|
||||
[49] Char ผ: 0.3104842727448765 %
|
||||
[50] Char ฟ: 0.18057532181135566 %
|
||||
[51] Char ฝ: 0.12389290689607847 %
|
||||
[52] Char ฤ: 0.09494069496588715 %
|
||||
[53] Char ฮ: 0.08394261445993785 %
|
||||
[54] Char ๆ: 0.07426054358717907 %
|
||||
[55] Char ฉ: 0.07397854152292395 %
|
||||
[56] Char ฒ: 0.06222845551229435 %
|
||||
[57] Char ฏ: 0.04916235986847423 %
|
||||
[58] Char ฎ: 0.04709434473060343 %
|
||||
[59] Char ฑ: 0.026696195416150443 %
|
||||
[60] Char ๊: 0.017672129359986917 %
|
||||
[61] Char ฬ: 0.01692012385530662 %
|
||||
[62] Char ๋: 0.010058073625098937 %
|
||||
[63] Char ฆ: 0.009306068120418641 %
|
||||
[64] Char ฯ: 0.007614055734887979 %
|
||||
[65] Char ฌ: 0.0032900240829762876 %
|
||||
[66] Char ฺ: 0.0031020227068062137 %
|
||||
[67] Char ฃ: 9.400068808503678e-05 %
|
||||
[68] Char ํ: 9.400068808503678e-05 %
|
||||
|
||||
The first 69 characters have an accumulated ratio of 1.0.
|
||||
The first 9 characters have an accumulated ratio of 0.4296733852091798.
|
||||
All characters whose order is over 43 have an accumulated ratio of 0.031143367969453536.
|
||||
|
||||
2458 sequences found.
|
||||
|
||||
First 1634 (typical positive ratio): 0.9950175671509007
|
||||
Next 347 (1981-1634): 0.003986731842203994
|
||||
Rest: 0.0009957010068952776
|
||||
|
||||
- Processing end: 2022-12-15 00:23:35.424483
|
||||
225
script/BuildLangModelLogs/LangTurkishModel.log
Normal file
225
script/BuildLangModelLogs/LangTurkishModel.log
Normal file
@ -0,0 +1,225 @@
|
||||
= Logs of language model for Turkish (tr) =
|
||||
|
||||
- Generated by BuildLangModel.py
|
||||
- Started: 2022-12-15 00:23:54.224964
|
||||
- Maximum depth: 4
|
||||
- Max number of pages: 200
|
||||
|
||||
== Parsed pages ==
|
||||
|
||||
Ana_Sayfa (revision 28930102)
|
||||
NASA (revision 28655062)
|
||||
Tepebaşı Tiyatrosu (revision 28726647)
|
||||
Müzik (revision 28867486)
|
||||
Marilyn Manson (revision 28771157)
|
||||
Rostislav (revision 24302020)
|
||||
Kahverengi pelikan (revision 26206141)
|
||||
Cahide Sonku (revision 28619115)
|
||||
Yevstafi (revision 27844178)
|
||||
Hırvatistan Savaşı (revision 28630176)
|
||||
Hunan (revision 28293939)
|
||||
Türk sineması (revision 28858174)
|
||||
Çürük Temel (revision 28740743)
|
||||
48. Altın Portakal Film Festivali (revision 24148018)
|
||||
Milan Kučan (revision 27271742)
|
||||
Yugoslavya Komünistler Birliği (revision 24987948)
|
||||
Skylab (revision 24865007)
|
||||
Holy Wood (In the Shadow of the Valley of Death) (revision 26336858)
|
||||
Tosca (revision 24870669)
|
||||
Türkçe (revision 28917063)
|
||||
Türkiye Cumhuriyeti Devlet Demiryolları (revision 28909701)
|
||||
Amazon Nehri (revision 25054700)
|
||||
Barack Obama (revision 28688807)
|
||||
Dita von Teese (revision 26342594)
|
||||
Yugoslav Halk Ordusu (revision 28868225)
|
||||
1984 yapımı Türk filmleri (revision 23700769)
|
||||
BMW (revision 28846977)
|
||||
I. Dünya Savaşı (revision 28889458)
|
||||
Muhsin Ertuğrul (revision 27603503)
|
||||
Zhou Qiang (revision 25082579)
|
||||
Varlık Vergisi (Türkiye) (revision 27855923)
|
||||
Millî Diyet Kütüphanesi (revision 28304907)
|
||||
Tepebaşı, Beyoğlu (revision 28853662)
|
||||
Malezya (revision 28854857)
|
||||
Binnaz (film, 1919) (revision 28053755)
|
||||
Rock is Dead (revision 25458774)
|
||||
Ulusal Arşivler ve Kayıtlar İdaresi (revision 28629611)
|
||||
Çalıkuşu (film, 1966) (revision 28185368)
|
||||
Nota (müzik) (revision 28874387)
|
||||
Jimnastik (revision 27595022)
|
||||
Hüseyin Suat Yalçın (revision 25645945)
|
||||
Kardeş gemi (revision 24248131)
|
||||
İzmir (revision 28898361)
|
||||
Xiangxi Tujia ve Miao Özerk İli (revision 28317158)
|
||||
Engin Alkan (revision 28675959)
|
||||
Disposable Teens (revision 25452472)
|
||||
2015 yapımı Türk filmleri (revision 28212202)
|
||||
Kemer zırhı (revision 24250603)
|
||||
58. Antalya Altın Portakal Film Festivali (revision 28191973)
|
||||
Deplasman (denizcilik) (revision 25792593)
|
||||
Türkiye (revision 28929245)
|
||||
Antalya Altın Portakal Film Festivali Sinema Yazarları Derneği (SİYAD) Ödülü (revision 27710192)
|
||||
Müzik şirketi (revision 28736884)
|
||||
Breslau (revision 28750401)
|
||||
Dünya Doğa ve Doğal Kaynakları Koruma Birliği (revision 28779552)
|
||||
33. Altın Portakal Film Festivali (revision 22753030)
|
||||
Eliza Binemeciyan (revision 26305286)
|
||||
Korsanmartıgiller (revision 27579708)
|
||||
İstanbul (revision 28933252)
|
||||
Müzisyen (revision 28140216)
|
||||
Kaliforniya (revision 27662958)
|
||||
Ankara (revision 28854191)
|
||||
Barselona (revision 28726854)
|
||||
Antichrist Superstar (şarkı) (revision 25458548)
|
||||
Buhar motoru (revision 28754565)
|
||||
Rus İmparatorluğu (revision 28934408)
|
||||
Kongre Kütüphanesi Kontrol Numarası (revision 27257543)
|
||||
Loudi (revision 25048862)
|
||||
Dreadnought (revision 26536776)
|
||||
Callisto Guatelli (revision 27759427)
|
||||
Pervane (revision 28641091)
|
||||
Zeynep Oral (revision 27809479)
|
||||
İsrail Ulusal Kütüphanesi (revision 27953386)
|
||||
WorldCat (revision 28632980)
|
||||
Tiyatro (revision 28917202)
|
||||
Gemini 11 (revision 28810810)
|
||||
Huaihua (revision 25005062)
|
||||
Derleme albüm (revision 25143874)
|
||||
Pelikan (revision 28223952)
|
||||
Beyaz Ordu (revision 24546589)
|
||||
Bolşevik (revision 27750483)
|
||||
Türkiye Radyo Televizyon Kurumu (revision 28871793)
|
||||
Ekvador (revision 28853507)
|
||||
Zihni Küçümen (revision 28494736)
|
||||
Uçan Süpürge Kadın Filmleri Festivali (revision 27561219)
|
||||
Amfibiler (revision 28750942)
|
||||
Koltuk kapasitesi (revision 24253080)
|
||||
Working Class Hero (revision 25009659)
|
||||
Dayton Anlaşması (revision 27353673)
|
||||
Zırhlı kule (revision 26067423)
|
||||
Suya iniş (revision 28184113)
|
||||
Wayback Machine (revision 28510856)
|
||||
Altın Portakal Cahide Sonku Ödülü (revision 27859679)
|
||||
Ön dretnot (revision 25434501)
|
||||
Pasifik Okyanusu (revision 28017484)
|
||||
Emigrate (revision 26145465)
|
||||
Dubrovnik (revision 26448353)
|
||||
Türkiye cumhurbaşkanı yardımcısı (revision 27626257)
|
||||
Karayipler (revision 28776869)
|
||||
Comédie-Française (revision 24983063)
|
||||
Vostok 4 (revision 26943553)
|
||||
Taret (revision 28873740)
|
||||
Rıdvan İsmail Paşa (revision 28670184)
|
||||
Deniz mili (revision 26177486)
|
||||
Müzikoloji (revision 28809569)
|
||||
Gemi (revision 28855280)
|
||||
Komedi filmi (revision 28802984)
|
||||
Eat Me, Drink Me (revision 25055615)
|
||||
Yugoslavya'nın dağılması (revision 27609767)
|
||||
Yünnan (revision 28325866)
|
||||
1954 yapımı Türk filmleri (revision 28924442)
|
||||
Uluslararası Antalya Film Festivali En İyi Film Ödülü (revision 27451738)
|
||||
Yueyang (revision 24987944)
|
||||
Kuş (revision 28634965)
|
||||
Raşit Rıza Samako (revision 26958699)
|
||||
Kırım (revision 28830891)
|
||||
Darülbedayi (revision 28820785)
|
||||
Rusçanın romanizasyonu (revision 28242435)
|
||||
II. Meşrutiyet (revision 28721667)
|
||||
Uluslararası Antalya Film Festivali En İyi Erkek Oyuncu Ödülü (revision 27451739)
|
||||
Aerodinamik (revision 28870696)
|
||||
Yüzölçümlerine göre ülkeler listesi (revision 28845422)
|
||||
Astonishing Panorama of the Endtimes (revision 25062769)
|
||||
John Edward Gray (revision 24948062)
|
||||
İstanbul Hukuk Mektebi (revision 28773571)
|
||||
Carl Ebert (revision 28121564)
|
||||
Pandomim (revision 27438976)
|
||||
Deliorman (revision 27353284)
|
||||
Türkiye'nin coğrafi bölgeleri (revision 28819228)
|
||||
Avustralya (revision 28758673)
|
||||
Open Library (revision 26914531)
|
||||
Ekosistem (revision 27773274)
|
||||
Fransa Millî Kütüphanesi (revision 25755687)
|
||||
TASS (revision 28314549)
|
||||
1921 yapımı Türk filmleri (revision 27632504)
|
||||
Eski Zağra (revision 28139594)
|
||||
Demir Çağı (revision 27762262)
|
||||
D-8 (revision 28599031)
|
||||
Elihu Root (revision 26874553)
|
||||
Galatasaray, Beyoğlu (revision 27404962)
|
||||
Atlas Okyanusu (revision 28878245)
|
||||
Bülbül, Beyoğlu (revision 25961059)
|
||||
Tainted Love (revision 26078322)
|
||||
Vostok programı (revision 28658515)
|
||||
Subwoofer (revision 26481880)
|
||||
Dil (revision 28786336)
|
||||
Vostok 2 (revision 25770387)
|
||||
Familya (revision 28874065)
|
||||
Nara (revision 25962872)
|
||||
Azeriler (revision 28935130)
|
||||
Sultan Abdülmecid (revision 28838806)
|
||||
34. Altın Portakal Film Festivali (revision 22753048)
|
||||
Kantat (revision 24888918)
|
||||
Emigrate (albüm) (revision 25055679)
|
||||
Altın Orda Devleti (revision 28917747)
|
||||
İstiklal Caddesi (revision 28862637)
|
||||
Goyang Geumjeong Mağarası Katliamı (revision 28790237)
|
||||
Şerif Mehmed Rauf Paşa (revision 28721707)
|
||||
Union List of Artist Names (revision 28848291)
|
||||
Terrassa (revision 28468341)
|
||||
Altın Portakal Uluslararası Uzun Metraj Film Yarışması (revision 28930899)
|
||||
Akarsu (revision 28880115)
|
||||
|
||||
== End of Parsed pages ==
|
||||
|
||||
- Wikipedia parsing ended at: 2022-12-15 00:26:41.274864
|
||||
|
||||
53 characters appeared 1003804 times.
|
||||
|
||||
Most Frequent characters:
|
||||
[ 0] Char a: 11.993277572115673 %
|
||||
[ 1] Char e: 9.07109356009739 %
|
||||
[ 2] Char i: 8.535231977557372 %
|
||||
[ 3] Char n: 7.531649604902949 %
|
||||
[ 4] Char r: 7.317763228678109 %
|
||||
[ 5] Char l: 7.2905666843327985 %
|
||||
[ 6] Char ı: 4.669636701985647 %
|
||||
[ 7] Char k: 4.296257038226586 %
|
||||
[ 8] Char d: 4.227618140593183 %
|
||||
[ 9] Char t: 3.9963977031372657 %
|
||||
[10] Char s: 3.5254890396930074 %
|
||||
[11] Char m: 3.385919960470371 %
|
||||
[12] Char u: 3.140453714071671 %
|
||||
[13] Char y: 2.9774736900829244 %
|
||||
[14] Char o: 2.6290989077549 %
|
||||
[15] Char b: 2.126012647887436 %
|
||||
[16] Char ü: 1.8743698969121463 %
|
||||
[17] Char ş: 1.5681348151631196 %
|
||||
[18] Char v: 1.3409988404110764 %
|
||||
[19] Char g: 1.166960880809401 %
|
||||
[20] Char z: 1.1435499360432912 %
|
||||
[21] Char h: 1.0191232551374572 %
|
||||
[22] Char ç: 0.9318552227327246 %
|
||||
[23] Char c: 0.8979840686030341 %
|
||||
[24] Char p: 0.8860295436160844 %
|
||||
[25] Char ğ: 0.8248622240995254 %
|
||||
[26] Char ö: 0.7329120027415711 %
|
||||
[27] Char f: 0.5717251575008667 %
|
||||
[28] Char j: 0.12173691278377054 %
|
||||
[29] Char w: 0.07073093950611872 %
|
||||
[30] Char â: 0.051802941610115116 %
|
||||
[31] Char î: 0.028491617885563317 %
|
||||
[32] Char x: 0.017931787480424465 %
|
||||
[33] Char û: 0.011556040820717988 %
|
||||
|
||||
The first 34 characters have an accumulated ratio of 0.9997469625544425.
|
||||
The first 5 characters have an accumulated ratio of 0.4444901594335149.
|
||||
All characters whose order is over 23 have an accumulated ratio of 0.03317779168044757.
|
||||
|
||||
1018 sequences found.
|
||||
|
||||
First 535 (typical positive ratio): 0.995041989262771
|
||||
Next 164 (699-535): 0.003962858701427208
|
||||
Rest: 0.0009951520358018051
|
||||
|
||||
- Processing end: 2022-12-15 00:26:41.410783
|
||||
279
script/BuildLangModelLogs/LangUkrainianModel.log
Normal file
279
script/BuildLangModelLogs/LangUkrainianModel.log
Normal file
@ -0,0 +1,279 @@
|
||||
= Logs of language model for Ukrainian (uk) =
|
||||
|
||||
- Generated by BuildLangModel.py
|
||||
- Started: 2022-12-17 18:44:33.402117
|
||||
- Maximum depth: 4
|
||||
- Max number of pages: 200
|
||||
|
||||
== Parsed pages ==
|
||||
|
||||
Операція_«Зільберфукс» (revision 37680420)
|
||||
Транспортне судно (revision 35090984)
|
||||
23 серпня (revision 36966610)
|
||||
Операція «Кілгол» (revision 35031376)
|
||||
Битва при Монте-Кассіно (revision 37799244)
|
||||
Планування операції (revision 36655146)
|
||||
Протитанкова гармата (revision 34967926)
|
||||
Радянські фронти часів Другої світової війни (revision 37075701)
|
||||
Операція «Везерюбунг» (revision 37799544)
|
||||
Поранений у бою (revision 37150681)
|
||||
Олександр Солженіцин (revision 37802645)
|
||||
Німецька імперія (revision 37724957)
|
||||
Радіаційний, хімічний, біологічний захист (revision 35207698)
|
||||
Артилерійський вогонь (revision 35906728)
|
||||
Океанографічне судно (revision 37766409)
|
||||
Wayback Machine (revision 37596310)
|
||||
1773 (revision 30421919)
|
||||
Рубіж (військова справа) (revision 18710733)
|
||||
Аспект (revision 36757836)
|
||||
Режим Віші (revision 36482509)
|
||||
Інформація (revision 37609117)
|
||||
Громадянська війна в Іспанії (revision 37006495)
|
||||
Друга світова війна (revision 37694876)
|
||||
День (revision 35387803)
|
||||
НКВС (revision 37782988)
|
||||
План «Гельб» (revision 36323339)
|
||||
13 жовтня (revision 37425950)
|
||||
Франція в Другій світовій війні (revision 37765912)
|
||||
Бакинська армія ППО (revision 36396350)
|
||||
Особливі заслуги перед Україною (revision 34996039)
|
||||
6 червня (revision 36967247)
|
||||
Фортифікаційне обладнання (revision 36658341)
|
||||
НАТО (revision 37663357)
|
||||
Інженерні війська (revision 37760820)
|
||||
Військово-лікарські комісії України (revision 37338993)
|
||||
Каспійська флотилія ВМФ СРСР (revision 36265095)
|
||||
Загін (військова справа) (revision 37118075)
|
||||
Рейнська операція (revision 34987656)
|
||||
Технічне забезпечення (revision 36658293)
|
||||
Операція «Едельвейс» (revision 25052827)
|
||||
Зріджений природний газ (revision 37234389)
|
||||
10 лютого (revision 36969165)
|
||||
Підполковник (revision 37429136)
|
||||
20 червня (revision 36974362)
|
||||
23 березня (revision 37424209)
|
||||
Вінстон Черчилль (revision 37617124)
|
||||
Батальйон (revision 37579056)
|
||||
Зниклий безвісти (revision 37150728)
|
||||
16 квітня (revision 37544630)
|
||||
Житомирсько-Бердичівська операція (revision 37765688)
|
||||
Артилерійська система (revision 35906687)
|
||||
Україна в Другій світовій війні (revision 35102511)
|
||||
СРСР (revision 37786142)
|
||||
Військова справа (revision 36081819)
|
||||
Медвеж'єгорська оперативна група (revision 35139881)
|
||||
1-ша піхотна дивізія (Канада) (revision 36888484)
|
||||
Репатріація (revision 37609719)
|
||||
Операція «Календа» (revision 34691025)
|
||||
19 серпня (revision 37161110)
|
||||
Контейнеровоз (revision 36075092)
|
||||
Бойовий статут (revision 36634840)
|
||||
Рефрижераторне судно (revision 37766332)
|
||||
6-та гвардійська танкова армія (СРСР) (revision 36484600)
|
||||
Гастарбайтер (revision 36108975)
|
||||
За поранення (нагрудний знак) (revision 34927306)
|
||||
Єльнинсько-Дорогобузька операція (revision 35793148)
|
||||
Ролкер (revision 37766334)
|
||||
ЗІС-2 (revision 33263795)
|
||||
Обчислювальна техніка (revision 37425821)
|
||||
Спеціальні війська (revision 35033617)
|
||||
Радіовежа (revision 34350408)
|
||||
Операція «Тезей» (revision 33350126)
|
||||
Видача козаків у Лієнці (revision 37351799)
|
||||
Операція «Вігорос» (revision 37509297)
|
||||
Протягання під кілем (revision 26271627)
|
||||
Еріх Редер (revision 37407936)
|
||||
Друга Яссько-Кишинівська операція (revision 37802348)
|
||||
2-й Прибалтійський фронт (revision 35908523)
|
||||
Битва у затоці Сидра (1941) (revision 37678820)
|
||||
Ракетний удар (revision 37295254)
|
||||
Допоміжне військове судно (revision 31395394)
|
||||
Балкер (revision 35942310)
|
||||
Перша словацька республіка (revision 37240751)
|
||||
Інвалід війни (revision 36724722)
|
||||
Matilda II (revision 35665157)
|
||||
Операція «Тайфун» (revision 27726969)
|
||||
Велика Британія (revision 37721780)
|
||||
Воєнний час (revision 36741277)
|
||||
Військові втрати (revision 36544558)
|
||||
Нарвік (revision 36746700)
|
||||
Пакет-судно (revision 37442334)
|
||||
Поранення (revision 37241274)
|
||||
Інтернування (revision 35910578)
|
||||
План операції (revision 36655146)
|
||||
Військовий жаргон (revision 36133497)
|
||||
Європа (revision 37605430)
|
||||
Окупація Греції країнами Осі (revision 35923077)
|
||||
Туапсинський оборонний район (revision 35096716)
|
||||
Вітебсько-Оршанська операція (revision 36546532)
|
||||
Військова операція (revision 36634889)
|
||||
Військове транспортне судно (revision 31110210)
|
||||
Прибалтійська операція (1944) (revision 37573989)
|
||||
Міністерство соціальної політики України (revision 36827291)
|
||||
Пором (revision 37766348)
|
||||
Операція «Мінсміт» (revision 37680451)
|
||||
Командувач (revision 35052770)
|
||||
Танк (revision 37751071)
|
||||
Пурпурне Серце (США) (revision 35444570)
|
||||
4 січня (revision 37356925)
|
||||
Ялтинська конференція (revision 36551994)
|
||||
Зброя (revision 35324714)
|
||||
27 вересня (revision 37209489)
|
||||
КВ-1 (revision 34967853)
|
||||
1673 (revision 33487225)
|
||||
16 червня (revision 36967399)
|
||||
Неділя (revision 36755940)
|
||||
Поранений в бою (revision 37150681)
|
||||
Експеримент із часом (revision 34883427)
|
||||
Плавуча база (revision 37766371)
|
||||
Сталін і релігія (revision 37743071)
|
||||
Калузька операція (revision 35801942)
|
||||
Кавказ (revision 37660429)
|
||||
Сінгапурська оборона (revision 36249620)
|
||||
Мир (revision 35400124)
|
||||
Розпад СРСР (revision 37763852)
|
||||
Операція «Уайт» (revision 35770525)
|
||||
Бурове судно (revision 37766395)
|
||||
Огаденська війна (revision 34632675)
|
||||
Кашкет (revision 37362104)
|
||||
РСЧА (revision 37509859)
|
||||
Стратегічні бомбардування в період Другої світової війни (revision 35170307)
|
||||
Окупація Австрії союзниками (revision 33198050)
|
||||
Шієн (revision 35537380)
|
||||
Посадження на палю (revision 30927864)
|
||||
5 січня (revision 36973301)
|
||||
1936 (revision 36692601)
|
||||
22 грудня (revision 37790943)
|
||||
Тактика (revision 35077432)
|
||||
Практика (revision 29380696)
|
||||
Місцеві вибори в Україні (revision 35930899)
|
||||
Вальтер Гевель (revision 36247972)
|
||||
2 червня (revision 36973908)
|
||||
Міжнародне космічне право (revision 35500957)
|
||||
Сленг (revision 37309330)
|
||||
M15/42 (revision 36995811)
|
||||
Союз Радянських Соціалістичних Республік (revision 37786142)
|
||||
Берестейська унія (revision 37281794)
|
||||
Шемшученко Юрій Сергійович (revision 36970654)
|
||||
Владислав Реймонт (revision 36305880)
|
||||
План «Гертруда» (revision 28888567)
|
||||
1-ша гвардійська армія (СРСР) (revision 36396299)
|
||||
Перемовини про обмеження стратегічних озброєнь (revision 37364095)
|
||||
Війни початку сучасної епохи (revision 34001433)
|
||||
Циркон (ракета) (revision 37517270)
|
||||
Стратегічна оборонна операція (revision 35056822)
|
||||
26 листопада (revision 37656690)
|
||||
Скрипниченко Дмитро Федорович (revision 33953482)
|
||||
Женевський саміт (1955) (revision 35669111)
|
||||
Митна енциклопедія (revision 35408437)
|
||||
Інженерний батальйон штурму та розгородження (revision 36185501)
|
||||
Прожитковий мінімум (revision 37547840)
|
||||
Регіон (revision 37055617)
|
||||
Бойова техніка (revision 35977735)
|
||||
1944 (revision 36506280)
|
||||
Тилове забезпечення (revision 36658279)
|
||||
Німецька мова (revision 36867771)
|
||||
Музичний розмір (revision 33593201)
|
||||
Лепель (revision 37644747)
|
||||
16 січня (revision 37357430)
|
||||
Нафтоналивні танкери (revision 31054818)
|
||||
Ніколаєв Андріян Григорович (revision 37590029)
|
||||
Німецько-радянська війна (revision 37509302)
|
||||
Чортків (revision 37780071)
|
||||
Відеогра (revision 37487609)
|
||||
Повітряно-десантні війська (revision 36977054)
|
||||
256-та піхотна дивізія (Третій Рейх) (revision 35318281)
|
||||
Закони та звичаї війни (revision 37544276)
|
||||
Ежен Мішель Антоніаді (revision 37531874)
|
||||
Сербська мова (revision 36916280)
|
||||
Німеччина (revision 37763492)
|
||||
Навчальний корабель (revision 35139505)
|
||||
Антикомінтернівський пакт (revision 37497934)
|
||||
Фундація Альфреда Слоуна (revision 35417522)
|
||||
Крилов Микола Іванович (revision 37099460)
|
||||
9-та армія (Третій Рейх) (revision 35332833)
|
||||
Нагрудний знак (revision 34887537)
|
||||
Арсеній Яценюк (revision 37794125)
|
||||
HMS Valiant (1914) (revision 35630087)
|
||||
Зовнішні Гебридські острови (revision 35058803)
|
||||
Диліжанс (revision 37205155)
|
||||
Х-59 (revision 37749210)
|
||||
Четвертування (revision 35201022)
|
||||
|
||||
== End of Parsed pages ==
|
||||
|
||||
- Wikipedia parsing ended at: 2022-12-17 18:48:43.498304
|
||||
|
||||
70 characters appeared 1713745 times.
|
||||
|
||||
Most Frequent characters:
|
||||
[ 0] Char о: 8.849624652442458 %
|
||||
[ 1] Char а: 8.07704763544168 %
|
||||
[ 2] Char н: 7.530700308388938 %
|
||||
[ 3] Char і: 6.208800025674765 %
|
||||
[ 4] Char р: 5.607893823176727 %
|
||||
[ 5] Char и: 5.474151638662695 %
|
||||
[ 6] Char в: 5.214836513016814 %
|
||||
[ 7] Char т: 4.518992032070115 %
|
||||
[ 8] Char е: 4.39878745087513 %
|
||||
[ 9] Char с: 4.367102456899947 %
|
||||
[10] Char к: 3.8551826555292648 %
|
||||
[11] Char л: 3.499120347542954 %
|
||||
[12] Char у: 3.166690493626531 %
|
||||
[13] Char д: 3.067025724363893 %
|
||||
[14] Char п: 2.82644150675859 %
|
||||
[15] Char м: 2.7081625329322625 %
|
||||
[16] Char я: 2.1060309439268967 %
|
||||
[17] Char з: 2.019495315814196 %
|
||||
[18] Char ь: 1.87939279180975 %
|
||||
[19] Char г: 1.5261897190072033 %
|
||||
[20] Char б: 1.4648036901639392 %
|
||||
[21] Char й: 1.4100114077648658 %
|
||||
[22] Char ч: 1.1883915051539173 %
|
||||
[23] Char х: 1.0766479260333364 %
|
||||
[24] Char ц: 0.9475738805948377 %
|
||||
[25] Char ї: 0.946290142349066 %
|
||||
[26] Char ю: 0.7253704605994474 %
|
||||
[27] Char ж: 0.655931891850888 %
|
||||
[28] Char ш: 0.5775655071203709 %
|
||||
[29] Char ф: 0.41569778467624996 %
|
||||
[30] Char є: 0.39264884799080374 %
|
||||
[31] Char e: 0.2995778251723564 %
|
||||
[32] Char i: 0.28358944883865456 %
|
||||
[33] Char a: 0.282013951900662 %
|
||||
[34] Char щ: 0.25488039352412406 %
|
||||
[35] Char n: 0.2371414650370971 %
|
||||
[36] Char r: 0.19460304771129894 %
|
||||
[37] Char s: 0.18100709265380788 %
|
||||
[38] Char t: 0.1762806018398303 %
|
||||
[39] Char o: 0.16595234413521265 %
|
||||
[40] Char c: 0.14325351788043145 %
|
||||
[41] Char h: 0.11687853210366769 %
|
||||
[42] Char l: 0.11506962821189849 %
|
||||
[43] Char m: 0.10269905966173497 %
|
||||
[44] Char d: 0.09663047886354154 %
|
||||
[45] Char b: 0.09487992671021651 %
|
||||
[46] Char u: 0.07801627429985208 %
|
||||
[47] Char w: 0.06774636833367859 %
|
||||
[48] Char k: 0.06161943579704099 %
|
||||
[49] Char p: 0.05905195930549761 %
|
||||
[50] Char y: 0.056776241506175065 %
|
||||
[51] Char g: 0.05304173024574835 %
|
||||
[52] Char f: 0.046973149447554916 %
|
||||
[53] Char v: 0.03985423735736647 %
|
||||
[54] Char x: 0.02404091623899705 %
|
||||
[55] Char ы: 0.01651354197969943 %
|
||||
[56] Char z: 0.016338486764366927 %
|
||||
[57] Char ґ: 0.01312914114993771 %
|
||||
|
||||
The first 58 characters have an accumulated ratio of 0.9998016040892901.
|
||||
The first 6 characters have an accumulated ratio of 0.4174821808378726.
|
||||
All characters whose order is over 30 have an accumulated ratio of 0.032775587966704496.
|
||||
|
||||
1538 sequences found.
|
||||
|
||||
First 818 (typical positive ratio): 0.9950006474582738
|
||||
Next 269 (1087-818): 0.004005190805257075
|
||||
Rest: 0.0009941617364691568
|
||||
|
||||
- Processing end: 2022-12-17 18:48:43.660343
|
||||
262
script/BuildLangModelLogs/LangVietnameseModel.log
Normal file
262
script/BuildLangModelLogs/LangVietnameseModel.log
Normal file
@ -0,0 +1,262 @@
|
||||
= Logs of language model for Vietnamese (vi) =
|
||||
|
||||
- Generated by BuildLangModel.py
|
||||
- Started: 2022-12-15 00:26:58.741409
|
||||
- Maximum depth: 4
|
||||
- Max number of pages: 200
|
||||
|
||||
== Parsed pages ==
|
||||
|
||||
Chữ_Quốc_ngữ (revision 69323365)
|
||||
Nho giáo (revision 69390317)
|
||||
Phương ngữ tiếng Việt (revision 69253911)
|
||||
Serampore (revision 69215747)
|
||||
Vi hiến (revision 68604810)
|
||||
Bàn phím máy tính (revision 69177994)
|
||||
Văn hóa Pháp (revision 54514660)
|
||||
Quốc ngữ (revision 69389443)
|
||||
Ngôn ngữ thanh điệu (revision 69010377)
|
||||
Chữ Quốc ngữ (revision 69323365)
|
||||
Đặc khu hành chính Cộng hòa Nhân dân Trung Hoa (revision 69426134)
|
||||
Phân biệt chủng tộc (revision 69056376)
|
||||
Tiếng Cornwall (revision 66276356)
|
||||
Bảng chữ cái tiếng Việt (revision 69323365)
|
||||
Tiếng Khmer (revision 69325997)
|
||||
Nhóm ngôn ngữ Rôman (revision 68954316)
|
||||
Tiếng Limburg (revision 68607883)
|
||||
Pháp (revision 69398708)
|
||||
Tiếng Yoruba (revision 69118098)
|
||||
ISBN (revision 68690711)
|
||||
Kathmandu (revision 68690328)
|
||||
Đồng bằng sông Cửu Long (revision 69246438)
|
||||
Tiếng Anh (revision 69388570)
|
||||
De facto (revision 69400492)
|
||||
Thành ngữ gốc Hán trong tiếng Việt (revision 69281032)
|
||||
T (revision 69090121)
|
||||
Sử dụng i và y trong chữ Quốc ngữ (revision 69323365)
|
||||
Văn hóa cao cấp (revision 67498856)
|
||||
Từ vựng tiếng Việt (revision 68581098)
|
||||
Bắc Bộ Việt Nam (revision 69358947)
|
||||
Duyên hải Nam Trung Bộ (revision 69340706)
|
||||
Từ mượn trong tiếng Việt (revision 69411530)
|
||||
Tiếng Breton (revision 66407538)
|
||||
Chủ nghĩa quốc xã (revision 69296691)
|
||||
BBC (revision 69385173)
|
||||
Đông Nam Á (revision 69258873)
|
||||
Backspace (revision 69372112)
|
||||
5 (số) (revision 66943809)
|
||||
Lhasa (revision 68702064)
|
||||
Tây Bắc Trung Quốc (revision 68370855)
|
||||
Kim Định (revision 68482544)
|
||||
Nam Bộ (Việt Nam) (revision 69385658)
|
||||
Chủ nghĩa dân tộc Nhật Bản (revision 69282936)
|
||||
Chủ nghĩa phát xít (revision 69427677)
|
||||
Độc lập (revision 69282907)
|
||||
Trung Nguyên (revision 68471807)
|
||||
Ký hiệu đô la (revision 68063682)
|
||||
Alaska (revision 68488368)
|
||||
Hiến pháp Cộng hòa Nhân dân Trung Hoa (revision 68618923)
|
||||
Viện hàn lâm Pháp (revision 67842447)
|
||||
Tiếng Wales (revision 69231563)
|
||||
Đồng tính luyến ái ở Trung Quốc (revision 69292330)
|
||||
Điện ảnh Pháp (revision 66377653)
|
||||
Toán học (revision 69375403)
|
||||
Ngữ pháp tiếng Việt (revision 68694548)
|
||||
Ngữ hệ Nin-Sahara (revision 69193091)
|
||||
Ngữ hệ Niger-Congo (revision 67824951)
|
||||
Nguồn máy tính (revision 68384586)
|
||||
Nhân dân tệ (revision 68764203)
|
||||
Tiếng Takua (revision 64937724)
|
||||
François Mauriac (revision 69191804)
|
||||
Quyền công dân (revision 69280750)
|
||||
Đặc khu liên bang (revision 68277202)
|
||||
Đô la Singapore (revision 68820222)
|
||||
V (revision 68453484)
|
||||
Augustus (revision 69427224)
|
||||
1955 (revision 69397550)
|
||||
Tiếng Nga (revision 69231375)
|
||||
Tây Ninh (revision 69265228)
|
||||
Trùng Khánh (revision 69281733)
|
||||
Hạ Long (revision 69237682)
|
||||
Tây Bắc Bộ (revision 69246433)
|
||||
Danh sách đơn vị hành chính Trung Quốc theo GDP bình quân đầu người (revision 68644474)
|
||||
VIQR (revision 69426437)
|
||||
Tư tưởng (revision 69179944)
|
||||
CBeebies (revision 69386322)
|
||||
Ngữ hệ Nam Đảo (revision 69193090)
|
||||
Chủ nghĩa Trump (revision 69282945)
|
||||
Công ty (revision 69075555)
|
||||
Chiến tranh Đông Dương (revision 69415170)
|
||||
Quy tắc đặt dấu thanh trong chữ quốc ngữ (revision 69127815)
|
||||
Điện ảnh Triều Tiên (revision 69257947)
|
||||
Châu Âu (revision 69335158)
|
||||
Năm (revision 68407392)
|
||||
Văn minh (revision 69261968)
|
||||
Ngữ hệ (revision 69193082)
|
||||
Bảng chữ cái Hy Lạp (revision 68485721)
|
||||
Vạn lý Trường chinh (revision 69246605)
|
||||
Hoài Hà (revision 68738981)
|
||||
Tiếng Xá Phó (revision 65405578)
|
||||
JSTOR (định danh) (revision 68334098)
|
||||
Diode (revision 69383509)
|
||||
Ngũ Chỉ Sơn (núi Trung Quốc) (revision 65453256)
|
||||
Hương (Trung Quốc) (revision 67193970)
|
||||
Thành ngữ (Tiếng Việt) (revision 69235423)
|
||||
Síp (revision 69329436)
|
||||
Baht Thái Lan (revision 69145112)
|
||||
Lý thuyết hình thái (revision 68862337)
|
||||
Jerusalem (revision 69289046)
|
||||
Ổ đĩa cứng (revision 69048111)
|
||||
Cà chua (revision 69199560)
|
||||
Kuala Lumpur (revision 69364800)
|
||||
Chủ nghĩa phân biệt chủng tộc (revision 69056376)
|
||||
Nam Giang, Quảng Nam (revision 68988006)
|
||||
Truyện ngắn (revision 69179000)
|
||||
Ẩm thực Pháp (revision 68242821)
|
||||
Cư Kuin (revision 68704228)
|
||||
Kiểm soát tính nhất quán (revision 69202370)
|
||||
Thư viện Quốc gia Pháp (revision 69229836)
|
||||
Nhóm ngôn ngữ Oïl (revision 66609000)
|
||||
Su hào (revision 69219380)
|
||||
Amin Maalouf (revision 64372255)
|
||||
Nguyễn Quảng Tuân (revision 65740823)
|
||||
Khải Định (revision 69352546)
|
||||
Úc (revision 69392760)
|
||||
Chủ nghĩa quốc tế (revision 68941294)
|
||||
SQL (revision 68382997)
|
||||
Huelgoat (revision 36181427)
|
||||
Hôn nhân màu tím (revision 64517405)
|
||||
Đế quốc Đông La Mã (revision 69311499)
|
||||
Phương ngữ Thanh Hóa (revision 69393354)
|
||||
Quần đảo Alexander (revision 68217402)
|
||||
Rết (revision 69210397)
|
||||
OCLC (định danh) (revision 68689511)
|
||||
Biến áp (revision 69414052)
|
||||
1888 (revision 69378216)
|
||||
Nhà nước Palestine (revision 69193568)
|
||||
Tenge Kazakhstan (revision 65144240)
|
||||
Vương Kỳ Sơn (revision 69243189)
|
||||
ISO 639-2 (revision 67773556)
|
||||
Tiếng Afrikaans (revision 68890705)
|
||||
Ngữ hệ Dené–Enisei (revision 67228355)
|
||||
Hệ ngôn ngữ (revision 69193082)
|
||||
Văn học dân gian (revision 68803869)
|
||||
Maastricht (revision 68124834)
|
||||
Louis de Funès (revision 68905491)
|
||||
Tây Tạng (revision 69238053)
|
||||
Máy chủ (revision 66645190)
|
||||
Yên Lộc, Kim Sơn (revision 67718890)
|
||||
|
||||
== End of Parsed pages ==
|
||||
|
||||
- Wikipedia parsing ended at: 2022-12-15 00:50:53.987711
|
||||
|
||||
107 characters appeared 1640238 times.
|
||||
|
||||
Most Frequent characters:
|
||||
[ 0] Char n: 11.695497848482963 %
|
||||
[ 1] Char h: 8.696177018213211 %
|
||||
[ 2] Char t: 6.726036099639199 %
|
||||
[ 3] Char c: 6.444918359408818 %
|
||||
[ 4] Char i: 5.80403575578666 %
|
||||
[ 5] Char g: 5.410800139979686 %
|
||||
[ 6] Char a: 3.645263675149582 %
|
||||
[ 7] Char u: 3.0321819150635454 %
|
||||
[ 8] Char đ: 2.4478154999457398 %
|
||||
[ 9] Char o: 2.4337321778912573 %
|
||||
[10] Char m: 2.4094064397971513 %
|
||||
[11] Char à: 2.1078038674875232 %
|
||||
[12] Char r: 2.0719554113488408 %
|
||||
[13] Char v: 2.0377530577879552 %
|
||||
[14] Char l: 2.0084280451983187 %
|
||||
[15] Char p: 1.8263203266842984 %
|
||||
[16] Char á: 1.625922579528093 %
|
||||
[17] Char ư: 1.563004880998977 %
|
||||
[18] Char s: 1.5177065767284992 %
|
||||
[19] Char b: 1.3103586186882634 %
|
||||
[20] Char y: 1.2965191636823437 %
|
||||
[21] Char k: 1.1537350067490206 %
|
||||
[22] Char d: 1.0767339861654224 %
|
||||
[23] Char e: 1.0382639592546936 %
|
||||
[24] Char ế: 0.9937582228920436 %
|
||||
[25] Char ộ: 0.8638380527703906 %
|
||||
[26] Char â: 0.8303063335930517 %
|
||||
[27] Char ệ: 0.8017129221491028 %
|
||||
[28] Char ố: 0.7481231382275012 %
|
||||
[29] Char ạ: 0.7387342568578463 %
|
||||
[30] Char ô: 0.7038003021512732 %
|
||||
[31] Char ê: 0.6733778878431057 %
|
||||
[32] Char ủ: 0.6715488849788872 %
|
||||
[33] Char ó: 0.6418580718164072 %
|
||||
[34] Char q: 0.6322862901603303 %
|
||||
[35] Char ả: 0.6050951142456156 %
|
||||
[36] Char ớ: 0.5788794065251507 %
|
||||
[37] Char ữ: 0.5116940346461916 %
|
||||
[38] Char ờ: 0.4946843080089597 %
|
||||
[39] Char ợ: 0.48773409712492943 %
|
||||
[40] Char ề: 0.4856612272121485 %
|
||||
[41] Char í: 0.4462157321071698 %
|
||||
[42] Char ấ: 0.44463059629151375 %
|
||||
[43] Char ị: 0.44188709199518605 %
|
||||
[44] Char ể: 0.41652491894468974 %
|
||||
[45] Char ậ: 0.4031731980358948 %
|
||||
[46] Char ự: 0.38982147712709986 %
|
||||
[47] Char ă: 0.3869560393064909 %
|
||||
[48] Char ơ: 0.37067791381494636 %
|
||||
[49] Char ầ: 0.3274524794572495 %
|
||||
[50] Char x: 0.32318480610740635 %
|
||||
[51] Char ã: 0.3056263786109089 %
|
||||
[52] Char ở: 0.30526057803806517 %
|
||||
[53] Char ì: 0.29763973277048816 %
|
||||
[54] Char ứ: 0.29264045827495766 %
|
||||
[55] Char ọ: 0.2834954439538652 %
|
||||
[56] Char ụ: 0.2792887373661627 %
|
||||
[57] Char ồ: 0.25880390528691566 %
|
||||
[58] Char ừ: 0.24593991847524566 %
|
||||
[59] Char ổ: 0.24423284913530843 %
|
||||
[60] Char ù: 0.21685877293417177 %
|
||||
[61] Char ắ: 0.1993003454376743 %
|
||||
[62] Char ú: 0.18064451622264574 %
|
||||
[63] Char ặ: 0.16912179817806927 %
|
||||
[64] Char ò: 0.1583306812791802 %
|
||||
[65] Char ử: 0.1311395053644654 %
|
||||
[66] Char ĩ: 0.12790826697101274 %
|
||||
[67] Char ằ: 0.1251037959125444 %
|
||||
[68] Char f: 0.12418929448043516 %
|
||||
[69] Char ỉ: 0.12144579018410742 %
|
||||
[70] Char ũ: 0.12120192313554497 %
|
||||
[71] Char ý: 0.1130323770087024 %
|
||||
[72] Char é: 0.09620555065789232 %
|
||||
[73] Char w: 0.0915720767352055 %
|
||||
[74] Char ỏ: 0.0730991478065988 %
|
||||
[75] Char j: 0.0643809008204907 %
|
||||
[76] Char ẫ: 0.06328349910195959 %
|
||||
[77] Char ễ: 0.05816229108214783 %
|
||||
[78] Char ẩ: 0.05712585612575736 %
|
||||
[79] Char ỳ: 0.04938307733389911 %
|
||||
[80] Char ẽ: 0.04688344008613384 %
|
||||
[81] Char ỷ: 0.046273772464727685 %
|
||||
[82] Char z: 0.03883582748357251 %
|
||||
[83] Char ỗ: 0.03883582748357251 %
|
||||
[84] Char ỹ: 0.033714619463760746 %
|
||||
[85] Char è: 0.02395993752126216 %
|
||||
[86] Char ẳ: 0.023898970759121542 %
|
||||
[87] Char ẻ: 0.022557701992027987 %
|
||||
[88] Char ẹ: 0.01755842749649746 %
|
||||
[89] Char õ: 0.017314560447934994 %
|
||||
[90] Char ỡ: 0.016521992540106986 %
|
||||
[91] Char ẵ: 0.005791842403358537 %
|
||||
[92] Char ç: 0.005182174781952375 %
|
||||
[93] Char ỵ: 0.0038409060148588197 %
|
||||
|
||||
The first 94 characters have an accumulated ratio of 0.9998957468367395.
|
||||
The first 6 characters have an accumulated ratio of 0.4477746522151054.
|
||||
All characters whose order is over 57 have an accumulated ratio of 0.03172832235321948.
|
||||
|
||||
1992 sequences found.
|
||||
|
||||
First 1145 (typical positive ratio): 0.9950096057827752
|
||||
Next 346 (1491-1145): 0.003994365518613985
|
||||
Rest: 0.000996028698610818
|
||||
|
||||
- Processing end: 2022-12-15 00:50:54.611610
|
||||
65
script/README
Normal file
65
script/README
Normal file
@ -0,0 +1,65 @@
|
||||
# Supporting New Languages or Updating Existing Ones #
|
||||
|
||||
We generate statistical language data using Wikipedia as a natural
|
||||
language text resource.
|
||||
|
||||
Right now, we have automated scripts only to generate statistical data
|
||||
for single-byte encodings. Multi-byte encodings usually require more
|
||||
in-depth knowledge of their specifications.
|
||||
|
||||
## New single-byte encoding ##
|
||||
|
||||
Uchardet uses language data, and therefore rather than supporting a
|
||||
charset, we in fact support a (language, charset) pair. So for
|
||||
instance if uchardet supports (French, ISO-8859-15), it should be able
|
||||
to recognize French text encoded in ISO-8859-15, but may fail at
|
||||
detecting ISO-8859-15 for non-supported languages.
|
||||
|
||||
Though less flexible, this approach also makes uchardet much more
|
||||
accurate than other detection systems, as well as making it an efficient
|
||||
language recognition system.
|
||||
Since many single-byte charsets actually share the same layout (or very
|
||||
similar ones), it is actually impossible to have an accurate single-byte
|
||||
encoding detector for random text.
|
||||
|
||||
Therefore you need to describe the language and the codepoint layouts of
|
||||
every charset you want to add support for.
|
||||
|
||||
I recommend having a look at langs/fr.py, which is heavily commented, as
|
||||
a base for a new language description, and charsets/windows-1252.py as a
|
||||
base for a new charset layout (note that charset layouts can be shared
|
||||
between languages. If yours is already there, you have nothing to do).
|
||||
The important names in the charset file are:
|
||||
|
||||
- `name`: an iconv-compatible name.
|
||||
- `charmap`: fill it with CTR (control character), SYM (symbol), NUM
|
||||
(number), LET (letter), ILL (illegal codepoint), RET (line break, i.e. LF or CR).
|
||||
|
||||
## Tools ##
|
||||
|
||||
You must install Python 3 and the [`Wikipedia` Python
|
||||
tool](https://github.com/goldsmith/Wikipedia).
|
||||
|
||||
If requirements change, these will be updated in `requirements.txt`, so that you
|
||||
can just run `pip3 install -r requirements.txt`.
|
||||
|
||||
## Run script ##
|
||||
|
||||
Let's say you added (or modified) support for French (`fr`); then run:
|
||||
|
||||
> ./BuildLangModel.py fr --max-page=200 --max-depth=4
|
||||
|
||||
The options can be changed to any value. Bigger values mean the script
|
||||
will process more data, so more processing time now, but uchardet may
|
||||
possibly be more accurate in the end.
|
||||
|
||||
## Updating core code ##
|
||||
|
||||
If you were only updating data for an existing language model, you have nothing
|
||||
else to do. Just build `uchardet` again and test it.
|
||||
|
||||
If you were creating new models though, you will have to add the sequence models
|
||||
in src/nsSBCSGroupProber.cpp and the language model in src/nsMBCSGroupProber.cpp.
|
||||
Finally, add the new file to src/CMakeLists.txt.
|
||||
|
||||
I will be looking to make this step more straightforward in the future.
|
||||
79
script/charsets/cp737.py
Normal file
79
script/charsets/cp737.py
Normal file
@ -0,0 +1,79 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
from codepoints import *
|
||||
|
||||
# NOTE: I use CP737 and not IBM737 as the main encoding name, since iconv
|
||||
# conversion failed with IBM737 with the file from #21 and in BuildLangModel.py
|
||||
# script, even though these are supposed to be synonyms.
|
||||
name = 'CP737'
|
||||
aliases = ['IBM737', 'OEM 737', 'MS-DOS Greek']
|
||||
|
||||
language = \
|
||||
{
|
||||
# Wikipedia tells us: Code page 737 (CCSID 737) (also known as CP 737,
|
||||
# IBM 00737, and OEM 737, MS-DOS Greek) is a code page used under DOS to
|
||||
# write the Greek language. It was much more popular than code page
|
||||
# 869 although it lacks the letters ΐ and ΰ.
|
||||
'complete': [ 'el' ],
|
||||
'incomplete': []
|
||||
}
|
||||
|
||||
# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
|
||||
charmap = \
|
||||
[
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
|
||||
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
|
||||
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 8X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 9X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # AX
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # BX
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # CX
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # DX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX
|
||||
LET,SYM,SYM,SYM,LET,LET,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # FX
|
||||
]
|
||||
88
script/charsets/georgian-academy.py
Normal file
88
script/charsets/georgian-academy.py
Normal file
@ -0,0 +1,88 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
from codepoints import *
|
||||
|
||||
name = 'GEORGIAN-ACADEMY'
|
||||
aliases = []
|
||||
|
||||
language = \
|
||||
{
|
||||
# Languages with complete coverage.
|
||||
'complete': [ 'ka' ],
|
||||
'incomplete': []
|
||||
}
|
||||
|
||||
## Table generated by script/create-table.py with: ##
|
||||
## iconv (Debian GLIBC 2.31-13+deb11u5) 2.31 ##
|
||||
# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
|
||||
charmap = \
|
||||
[
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
|
||||
# ' ' '!' '"' '#' '$' '%' '&' ''' '(' ')' '*' '+' ',' '-' '.' '/'
|
||||
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
|
||||
# '0' '1' '2' '3' '4' '5' '6' '7' '8' '9' ':' ';' '<' '=' '>' '?'
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
|
||||
# '@' 'A' 'B' 'C' 'D' 'E' 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O'
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
|
||||
# 'P' 'Q' 'R' 'S' 'T' 'U' 'V' 'W' 'X' 'Y' 'Z' '[' '\' ']' '^' '_'
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
|
||||
# '`' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o'
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
|
||||
# 'p' 'q' 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z' '{' '|' '}' '~' CTR
|
||||
CTR,CTR,SYM,LET,SYM,SYM,SYM,SYM,LET,SYM,LET,SYM,LET,CTR,CTR,CTR, # 8X
|
||||
# CTR CTR '‚' 'ƒ' '„' '…' '†' '‡' 'ˆ' '‰' 'Š' '‹' 'Œ' CTR CTR CTR
|
||||
CTR,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,LET,SYM,LET,CTR,CTR,LET, # 9X
|
||||
# CTR '‘' '’' '“' '”' '•' '–' '—' '˜' '™' 'š' '›' 'œ' CTR CTR 'Ÿ'
|
||||
CTR,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,LET,SYM,SYM,CTR,SYM,SYM, # AX
|
||||
# CTR '¡' '¢' '£' '¤' '¥' '¦' '§' '¨' '©' 'ª' '«' '¬' CTR '®' '¯'
|
||||
SYM,SYM,NUM,NUM,SYM,LET,SYM,SYM,SYM,NUM,LET,SYM,SYM,SYM,SYM,SYM, # BX
|
||||
# '°' '±' '²' '³' '´' 'µ' '¶' '·' '¸' '¹' 'º' '»' '¼' '½' '¾' '¿'
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX
|
||||
# 'ა' 'ბ' 'გ' 'დ' 'ე' 'ვ' 'ზ' 'თ' 'ი' 'კ' 'ლ' 'მ' 'ნ' 'ო' 'პ' 'ჟ'
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # DX
|
||||
# 'რ' 'ს' 'ტ' 'უ' 'ფ' 'ქ' 'ღ' 'ყ' 'შ' 'ჩ' 'ც' 'ძ' 'წ' 'ჭ' 'ხ' 'ჯ'
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX
|
||||
# 'ჰ' 'ჱ' 'ჲ' 'ჳ' 'ჴ' 'ჵ' 'ჶ' 'ç' 'è' 'é' 'ê' 'ë' 'ì' 'í' 'î' 'ï'
|
||||
LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,LET, # FX
|
||||
# 'ð' 'ñ' 'ò' 'ó' 'ô' 'õ' 'ö' '÷' 'ø' 'ù' 'ú' 'û' 'ü' 'ý' 'þ' 'ÿ'
|
||||
]
|
||||
88
script/charsets/georgian-ps.py
Normal file
88
script/charsets/georgian-ps.py
Normal file
@ -0,0 +1,88 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
from codepoints import *
|
||||
|
||||
name = 'GEORGIAN-PS'
|
||||
aliases = []
|
||||
|
||||
language = \
|
||||
{
|
||||
# Languages with complete coverage.
|
||||
'complete': [ 'ka' ],
|
||||
'incomplete': []
|
||||
}
|
||||
|
||||
## Table generated by script/create-table.py with: ##
|
||||
## iconv (Debian GLIBC 2.31-13+deb11u5) 2.31 ##
|
||||
# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
|
||||
charmap = \
|
||||
[
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
|
||||
# ' ' '!' '"' '#' '$' '%' '&' ''' '(' ')' '*' '+' ',' '-' '.' '/'
|
||||
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
|
||||
# '0' '1' '2' '3' '4' '5' '6' '7' '8' '9' ':' ';' '<' '=' '>' '?'
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
|
||||
# '@' 'A' 'B' 'C' 'D' 'E' 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O'
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
|
||||
# 'P' 'Q' 'R' 'S' 'T' 'U' 'V' 'W' 'X' 'Y' 'Z' '[' '\' ']' '^' '_'
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
|
||||
# '`' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o'
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
|
||||
# 'p' 'q' 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z' '{' '|' '}' '~' CTR
|
||||
CTR,CTR,SYM,LET,SYM,SYM,SYM,SYM,LET,SYM,LET,SYM,LET,CTR,CTR,CTR, # 8X
|
||||
# CTR CTR '‚' 'ƒ' '„' '…' '†' '‡' 'ˆ' '‰' 'Š' '‹' 'Œ' CTR CTR CTR
|
||||
CTR,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,LET,SYM,LET,CTR,CTR,LET, # 9X
|
||||
# CTR '‘' '’' '“' '”' '•' '–' '—' '˜' '™' 'š' '›' 'œ' CTR CTR 'Ÿ'
|
||||
CTR,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,LET,SYM,SYM,CTR,SYM,SYM, # AX
|
||||
# CTR '¡' '¢' '£' '¤' '¥' '¦' '§' '¨' '©' 'ª' '«' '¬' CTR '®' '¯'
|
||||
SYM,SYM,NUM,NUM,SYM,LET,SYM,SYM,SYM,NUM,LET,SYM,SYM,SYM,SYM,SYM, # BX
|
||||
# '°' '±' '²' '³' '´' 'µ' '¶' '·' '¸' '¹' 'º' '»' '¼' '½' '¾' '¿'
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX
|
||||
# 'ა' 'ბ' 'გ' 'დ' 'ე' 'ვ' 'ზ' 'ჱ' 'თ' 'ი' 'კ' 'ლ' 'მ' 'ნ' 'ჲ' 'ო'
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # DX
|
||||
# 'პ' 'ჟ' 'რ' 'ს' 'ტ' 'ჳ' 'უ' 'ფ' 'ქ' 'ღ' 'ყ' 'შ' 'ჩ' 'ც' 'ძ' 'წ'
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX
|
||||
# 'ჭ' 'ხ' 'ჴ' 'ჯ' 'ჰ' 'ჵ' 'æ' 'ç' 'è' 'é' 'ê' 'ë' 'ì' 'í' 'î' 'ï'
|
||||
LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,LET, # FX
|
||||
# 'ð' 'ñ' 'ò' 'ó' 'ô' 'õ' 'ö' '÷' 'ø' 'ù' 'ú' 'û' 'ü' 'ý' 'þ' 'ÿ'
|
||||
]
|
||||
72
script/charsets/ibm852.py
Normal file
72
script/charsets/ibm852.py
Normal file
@ -0,0 +1,72 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
from codepoints import *
|
||||
|
||||
name = 'IBM852'
|
||||
aliases = ['CP852']
|
||||
|
||||
language = \
|
||||
{
|
||||
'complete': [ 'bs', 'hr', 'cs', 'de', 'hu', 'pl', 'sr', 'sk', 'sl',
|
||||
'hsb', 'dsb', 'tk' ],
|
||||
'incomplete': [ 'ro' ]
|
||||
}
|
||||
|
||||
# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
|
||||
charmap = \
|
||||
[
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
|
||||
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 8X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,LET, # 9X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,SYM,SYM, # AX
|
||||
SYM,SYM,SYM,SYM,SYM,LET,LET,LET,LET,SYM,SYM,SYM,SYM,LET,LET,SYM, # BX
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,LET,LET,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # CX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,LET,LET,SYM, # DX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM, # EX
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,LET,LET,LET,SYM,SYM, # FX
|
||||
]
|
||||
75
script/charsets/ibm855.py
Normal file
75
script/charsets/ibm855.py
Normal file
@ -0,0 +1,75 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
from codepoints import *
|
||||
|
||||
name = 'IBM855'
|
||||
aliases = ['CP855', 'OEM 855', 'MS-DOS Cyrillic']
|
||||
|
||||
language = \
|
||||
{
|
||||
# Wikipedia tells us: At one time it was widely used in Serbia, Macedonia
|
||||
# and Bulgaria, but it never caught on in Russia, where Code page 866 was more
|
||||
# common. This code page is not used much.
|
||||
'complete': [ 'sr', 'mk', 'bg', 'ru' ],
|
||||
'incomplete': []
|
||||
}
|
||||
|
||||
# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
|
||||
charmap = \
|
||||
[
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
|
||||
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
|
||||
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 8X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 9X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM, # AX
|
||||
SYM,SYM,SYM,SYM,SYM,LET,LET,LET,LET,SYM,SYM,SYM,SYM,LET,LET,SYM, # BX
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,LET,LET,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # CX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,LET,LET,SYM, # DX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM, # EX
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM, # FX
|
||||
]
|
||||
71
script/charsets/ibm862.py
Normal file
71
script/charsets/ibm862.py
Normal file
@ -0,0 +1,71 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
from codepoints import *
|
||||
|
||||
name = 'IBM862'
|
||||
aliases = ['CP862', 'OEM 862 (Hebrew)', 'MS-DOS Hebrew']
|
||||
|
||||
language = \
|
||||
{
|
||||
'complete': [ 'he' ],
|
||||
'incomplete': []
|
||||
}
|
||||
|
||||
# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
|
||||
charmap = \
|
||||
[
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
|
||||
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 8X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,LET, # 9X
|
||||
LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # AX
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # BX
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # CX
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # DX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,SYM, # EX
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # FX
|
||||
]
|
||||
71
script/charsets/ibm865.py
Normal file
71
script/charsets/ibm865.py
Normal file
@ -0,0 +1,71 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
from codepoints import *
|
||||
|
||||
name = 'IBM865'
|
||||
aliases = ['CP865', '865', 'CSIBM865']
|
||||
|
||||
language = \
|
||||
{
|
||||
'complete': [ 'no', 'da' ],
|
||||
'incomplete': []
|
||||
}
|
||||
|
||||
# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
|
||||
charmap = \
|
||||
[
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
|
||||
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 8X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,LET,SYM,SYM, # 9X
|
||||
LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # AX
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # BX
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # CX
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # DX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM, # EX
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # FX
|
||||
]
|
||||
72
script/charsets/ibm866.py
Normal file
72
script/charsets/ibm866.py
Normal file
@ -0,0 +1,72 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
from codepoints import *
|
||||
|
||||
name = 'IBM866'
|
||||
aliases = ['CP866', 'DOS Cyrillic Russian']
|
||||
|
||||
language = \
|
||||
{
|
||||
'complete': [ 'bg', 'ru' ],
|
||||
'incomplete': [ 'uk', 'be' ]
|
||||
}
|
||||
|
||||
# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
|
||||
charmap = \
|
||||
[
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
|
||||
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
|
||||
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 8X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 9X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # AX
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # BX
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # CX
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # DX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # FX
|
||||
]
|
||||
73
script/charsets/iso-8859-10.py
Normal file
73
script/charsets/iso-8859-10.py
Normal file
@ -0,0 +1,73 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
from codepoints import *
|
||||
|
||||
name = 'ISO-8859-10'
|
||||
aliases = ['ISO_8859-10:1992', 'ISO_8859-10', 'iso-ir-157',
|
||||
'csISOLatin6', 'latin6', 'l6']
|
||||
|
||||
language = \
|
||||
{
|
||||
# Nordic languages. Supersedes ISO-8859-4.
|
||||
'complete': [ 'et', 'lv', 'lt', 'kl', 'saam1281' ],
|
||||
'incomplete': []
|
||||
}
|
||||
|
||||
# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
|
||||
charmap = \
|
||||
[
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
|
||||
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 8X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 9X
|
||||
SYM,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,SYM,LET,LET, # AX
|
||||
SYM,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,LET, # BX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # DX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # FX
|
||||
]
|
||||
77
script/charsets/iso-8859-11.py
Normal file
77
script/charsets/iso-8859-11.py
Normal file
@ -0,0 +1,77 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
from codepoints import *
|
||||
|
||||
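# NOTE(review): the note below describes ISO-8859-1, while this file defines
|
||||
# ISO-8859-11; it looks carried over from another charset file -- TODO confirm.
|
||||
# ISO-8859-1 is the full 8-bit range, IANA-defined, superset of ISO/IEC 8859-1.
|
||||
# It is basically the same as ISO/IEC 8859-1, but with control characters.
|
||||
# As far as I can see, `iconv` has no support for the ISO/IEC 8859-1 subset,
|
||||
# so there is no need for us to support it anyway.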
|
||||
|
||||
name = 'ISO-8859-11'
|
||||
aliases = []
|
||||
|
||||
language = \
|
||||
{
|
||||
# Designed for Thai language.
|
||||
'complete': ['th'],
|
||||
'incomplete': []
|
||||
}
|
||||
|
||||
# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
|
||||
charmap = \
|
||||
[
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
|
||||
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 8X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 9X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # AX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # BX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX
|
||||
LET,SYM,LET,LET,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,ILL,ILL,ILL,SYM, # DX
|
||||
LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,LET, # EX
|
||||
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,LET,LET,ILL,ILL,ILL,ILL, # FX
|
||||
]
|
||||
72
script/charsets/iso-8859-13.py
Normal file
72
script/charsets/iso-8859-13.py
Normal file
@ -0,0 +1,72 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
from codepoints import *
|
||||
|
||||
name = 'ISO-8859-13'
|
||||
aliases = ['csISO885913']
|
||||
|
||||
language = \
|
||||
{
|
||||
# Designed to cover Baltic languages.
|
||||
'complete': [ 'lv', 'lt' ],
|
||||
'incomplete': []
|
||||
}
|
||||
|
||||
# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
|
||||
charmap = \
|
||||
[
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
|
||||
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 8X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 9X
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,LET,SYM,LET,SYM,SYM,SYM,SYM,LET, # AX
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,LET,SYM,LET,SYM,SYM,SYM,SYM,LET, # BX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX
|
||||
LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,LET, # DX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX
|
||||
LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,SYM, # FX
|
||||
]
|
||||
83
script/charsets/iso-8859-16.py
Normal file
83
script/charsets/iso-8859-16.py
Normal file
@ -0,0 +1,83 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
from codepoints import *
|
||||
|
||||
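# NOTE(review): the note below describes ISO-8859-1, while this file defines
|
||||
# ISO-8859-16; it looks carried over from another charset file -- TODO confirm.
|
||||
# ISO-8859-1 is the full 8-bit range, IANA-defined, superset of ISO/IEC 8859-1.
|
||||
# It is basically the same as ISO/IEC 8859-1, but with control characters.
|
||||
# As far as I can see, `iconv` has no support for the ISO/IEC 8859-1 subset,
|
||||
# so there is no need for us to support it anyway.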
|
||||
|
||||
name = 'ISO-8859-16'
|
||||
aliases = ['ISO_8859-16:2001', 'ISO_8859-16', 'iso-ir-226',
|
||||
'csISO885916', 'latin10', 'l10']
|
||||
|
||||
language = \
|
||||
{
|
||||
# Languages with complete coverage.
|
||||
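# NOTE(review): the two remarks below refer to ISO-8859-1 and to Kurdish and
|
||||
# Leonese, neither of which is in the list that follows -- TODO confirm they
|
||||
# still apply to this charset.
|
||||
# Some languages actually have several alphabets and only one of them is
|
||||
# compatible with ISO-8859-1 (e.g. Kurdish).
|
||||
# Some don't have an ISO language code (like Leonese, for which I used
|
||||
# a Glottolog code).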
|
||||
'complete': [ 'sq', 'hr', 'hu', 'pl', 'ro', 'sr', 'sl',
|
||||
'fr', 'de', 'it', 'ga' ],
|
||||
'incomplete': []
|
||||
}
|
||||
|
||||
# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
|
||||
charmap = \
|
||||
[
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
|
||||
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 8X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 9X
|
||||
SYM,LET,LET,LET,SYM,SYM,LET,SYM,LET,SYM,LET,SYM,LET,SYM,LET,LET, # AX
|
||||
SYM,SYM,LET,LET,LET,SYM,SYM,SYM,LET,LET,LET,SYM,LET,LET,LET,LET, # BX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # DX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # FX
|
||||
]
|
||||
73
script/charsets/iso-8859-2.py
Normal file
73
script/charsets/iso-8859-2.py
Normal file
@ -0,0 +1,73 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
from codepoints import *
|
||||
|
||||
name = 'ISO-8859-2'
|
||||
aliases = ['ISO_8859-2:1987', 'ISO_8859-2', 'iso-ir-101',
|
||||
'csISOLatin2', 'latin2', 'l2']
|
||||
|
||||
language = \
|
||||
{
|
||||
'complete': [ 'bs', 'hr', 'cs', 'de', 'hu', 'pl', 'sr', 'sk', 'sl',
|
||||
'hsb', 'dsb', 'tk' ],
|
||||
'incomplete': [ 'ro' ]
|
||||
}
|
||||
|
||||
# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
|
||||
charmap = \
|
||||
[
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
|
||||
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 8X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 9X
|
||||
SYM,LET,SYM,LET,SYM,LET,LET,SYM,SYM,LET,LET,LET,LET,SYM,LET,LET, # AX
|
||||
SYM,LET,SYM,LET,SYM,LET,LET,SYM,SYM,LET,LET,LET,LET,SYM,LET,LET, # BX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX
|
||||
LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,LET, # DX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX
|
||||
LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,SYM, # FX
|
||||
]
|
||||
75
script/charsets/iso-8859-3.py
Normal file
75
script/charsets/iso-8859-3.py
Normal file
@ -0,0 +1,75 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
from codepoints import *
|
||||
|
||||
# ISO-8859-3 is the full 8-bit range, IANA-defined, superset of ISO/IEC 8859-3.
|
||||
# It is basically the same as ISO/IEC 8859-3, but with control characters.
|
||||
name = 'ISO-8859-3'
|
||||
aliases = ['ISO_8859-3:1988', 'ISO_8859-3', 'iso-ir-109',
|
||||
'csISOLatin3', 'latin3', 'l3']
|
||||
|
||||
language = \
|
||||
{
|
||||
# Languages with complete coverage.
|
||||
'complete': [ 'eo', 'tr', 'mt' ],
|
||||
'incomplete': []
|
||||
}
|
||||
|
||||
# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
|
||||
charmap = \
|
||||
[
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
|
||||
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 8X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 9X
|
||||
SYM,LET,SYM,SYM,SYM,ILL,LET,SYM,SYM,LET,LET,LET,LET,SYM,ILL,LET, # AX
|
||||
SYM,LET,SYM,SYM,SYM,SYM,LET,SYM,SYM,LET,LET,LET,LET,SYM,ILL,LET, # BX
|
||||
LET,LET,LET,ILL,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX
|
||||
ILL,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,LET, # DX
|
||||
LET,LET,LET,ILL,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX
|
||||
ILL,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,SYM, # FX
|
||||
]
|
||||
73
script/charsets/iso-8859-4.py
Normal file
73
script/charsets/iso-8859-4.py
Normal file
@ -0,0 +1,73 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
from codepoints import *
|
||||
|
||||
name = 'ISO-8859-4'
|
||||
aliases = ['ISO_8859-4:1988', 'ISO_8859-4', 'iso-ir-110',
|
||||
'csISOLatin4', 'latin4', 'l4']
|
||||
|
||||
language = \
|
||||
{
|
||||
# Nordic languages. Largely superseded by ISO-8859-10.
|
||||
'complete': [ 'et', 'lv', 'lt', 'kl', 'saam1281' ],
|
||||
'incomplete': []
|
||||
}
|
||||
|
||||
# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
|
||||
charmap = \
|
||||
[
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
|
||||
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 8X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 9X
|
||||
SYM,LET,LET,LET,SYM,LET,LET,SYM,SYM,LET,LET,LET,LET,SYM,LET,SYM, # AX
|
||||
SYM,LET,SYM,LET,SYM,LET,LET,SYM,SYM,LET,LET,LET,LET,LET,LET,LET, # BX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX
|
||||
LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,LET, # DX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX
|
||||
LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,SYM, # FX
|
||||
]
|
||||
72
script/charsets/iso-8859-5.py
Normal file
72
script/charsets/iso-8859-5.py
Normal file
@ -0,0 +1,72 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
from codepoints import *
|
||||
|
||||
name = 'ISO-8859-5'
|
||||
aliases = ['ISO_8859-5:1988', 'ISO_8859-5', 'iso-ir-144',
|
||||
'cyrillic', 'csISOLatinCyrillic']
|
||||
|
||||
language = \
|
||||
{
|
||||
'complete': [ 'bg', 'be', 'ru', 'sr', 'mk' ],
|
||||
'incomplete': [ 'uk' ]
|
||||
}
|
||||
|
||||
# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
|
||||
charmap = \
|
||||
[
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
|
||||
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 8X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 9X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET, # AX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # BX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # DX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET, # FX
|
||||
]
|
||||
73
script/charsets/iso-8859-6.py
Normal file
73
script/charsets/iso-8859-6.py
Normal file
@ -0,0 +1,73 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
from codepoints import *
|
||||
|
||||
name = 'ISO-8859-6'
|
||||
aliases = ['ISO_8859-6:1987', 'ISO_8859-6', 'iso-ir-127',
|
||||
'ECMA-114', 'ASMO-708', 'arabic', 'csISOLatinArabic']
|
||||
|
||||
language = \
|
||||
{
|
||||
# Dedicated to Arabic.
|
||||
'complete': [ 'ar' ],
|
||||
'incomplete': []
|
||||
}
|
||||
|
||||
# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
|
||||
charmap = \
|
||||
[
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
|
||||
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 8X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 9X
|
||||
SYM,ILL,ILL,ILL,SYM,ILL,ILL,ILL,ILL,ILL,ILL,ILL,SYM,SYM,ILL,ILL, # AX
|
||||
ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,SYM,ILL,ILL,ILL,SYM, # BX
|
||||
ILL,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,ILL,ILL,ILL,ILL,ILL, # DX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # EX
|
||||
SYM,SYM,SYM,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL,ILL, # FX
|
||||
]
|
||||
73
script/charsets/iso-8859-7.py
Normal file
73
script/charsets/iso-8859-7.py
Normal file
@ -0,0 +1,73 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
from codepoints import *
|
||||
|
||||
name = 'ISO-8859-7'
|
||||
aliases = ['ISO_8859-7:1987', 'ISO_8859-7', 'iso-ir-126',
|
||||
'ELOT_928', 'ECMA-118', 'greek', 'greek8', 'csISOLatinGreek']
|
||||
|
||||
language = \
|
||||
{
|
||||
# Dedicated to modern Greek.
|
||||
'complete': [ 'el' ],
|
||||
'incomplete': []
|
||||
}
|
||||
|
||||
# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
|
||||
charmap = \
|
||||
[
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
|
||||
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 8X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 9X
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM, # AX
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,LET,SYM,LET,LET,LET,SYM,LET,SYM,LET,LET, # BX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX
|
||||
LET,LET,ILL,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # DX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,ILL, # FX
|
||||
]
|
||||
72
script/charsets/iso-8859-8.py
Normal file
72
script/charsets/iso-8859-8.py
Normal file
@ -0,0 +1,72 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
from codepoints import *
|
||||
|
||||
name = 'ISO-8859-8'
|
||||
aliases = ['ISO_8859-8:1988', 'ISO_8859-8', 'iso-ir-138',
|
||||
'csISOLatinHebrew', 'hebrew']
|
||||
|
||||
language = \
|
||||
{
|
||||
'complete': [ 'he' ],
|
||||
'incomplete': []
|
||||
}
|
||||
|
||||
# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
|
||||
charmap = \
|
||||
[
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
|
||||
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 8X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 9X
|
||||
SYM,CTR,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # AX
|
||||
SYM,SYM,SYM,SYM,SYM,LET,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,CTR, # BX
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # CX
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,SYM, # DX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,CTR,CTR,SYM,SYM,CTR, # FX
|
||||
]
|
||||
76
script/charsets/iso-8859-9.py
Normal file
76
script/charsets/iso-8859-9.py
Normal file
@ -0,0 +1,76 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
from codepoints import *
|
||||
|
||||
# ISO-8859-9 is the full 8-bit range, IANA-defined, superset of ISO/IEC 8859-9.
|
||||
# It is basically the same as ISO/IEC 8859-9, but with control characters.
|
||||
|
||||
name = 'ISO-8859-9'
|
||||
aliases = ['ISO_8859-9:1989', 'ISO_8859-9', 'iso-ir-148',
|
||||
'csISOLatin5', 'latin5', 'l5']
|
||||
|
||||
language = \
|
||||
{
|
||||
# Specifically made to cover Turkish.
|
||||
'complete': [ 'tr' ],
|
||||
'incomplete': []
|
||||
}
|
||||
|
||||
# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
|
||||
charmap = \
|
||||
[
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
|
||||
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 8X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 9X
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # AX
|
||||
SYM,SYM,SYM,SYM,SYM,LET,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # BX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX
|
||||
LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,LET, # DX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX
|
||||
LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,LET, # FX
|
||||
]
|
||||
74
script/charsets/koi8-r.py
Normal file
74
script/charsets/koi8-r.py
Normal file
@ -0,0 +1,74 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
from codepoints import *
|
||||
|
||||
name = 'KOI8-R'
|
||||
aliases = ['csKOI8R']
|
||||
|
||||
language = \
|
||||
{
|
||||
# KOI8-R is an 8-bit character encoding, designed to cover Russian, which
|
||||
# uses a Cyrillic alphabet. It also happens to cover Bulgarian, but has not
|
||||
# been used for that purpose since CP1251 was accepted.
|
||||
'complete': [ 'ru', 'bg' ],
|
||||
'incomplete': []
|
||||
}
|
||||
|
||||
# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
|
||||
charmap = \
|
||||
[
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
|
||||
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 8X
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 9X
|
||||
SYM,SYM,SYM,LET,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # AX
|
||||
SYM,SYM,SYM,LET,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # BX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # DX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # FX
|
||||
]
|
||||
72
script/charsets/mac-centraleurope.py
Normal file
72
script/charsets/mac-centraleurope.py
Normal file
@ -0,0 +1,72 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
from codepoints import *
|
||||
|
||||
name = 'MAC-CENTRALEUROPE'
|
||||
aliases = []
|
||||
|
||||
language = \
|
||||
{
|
||||
'complete': [ 'bs', 'hr', 'cs', 'de', 'hu', 'pl', 'sr', 'sk', 'sl',
|
||||
'hsb', 'dsb', 'tk' ],
|
||||
'incomplete': [ 'ro' ]
|
||||
}
|
||||
|
||||
# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
|
||||
charmap = \
|
||||
[
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
|
||||
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 8X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 9X
|
||||
SYM,SYM,LET,SYM,SYM,SYM,SYM,LET,SYM,SYM,SYM,LET,SYM,SYM,LET,LET, # AX
|
||||
LET,LET,SYM,SYM,LET,LET,SYM,SYM,LET,LET,LET,LET,LET,LET,LET,LET, # BX
|
||||
LET,LET,SYM,SYM,LET,LET,SYM,SYM,SYM,SYM,SYM,LET,LET,LET,LET,LET, # CX
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,LET,LET,LET,LET,SYM,SYM,LET,LET, # DX
|
||||
LET,LET,SYM,SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM, # FX
|
||||
]
|
||||
72
script/charsets/mac-cyrillic.py
Normal file
72
script/charsets/mac-cyrillic.py
Normal file
@ -0,0 +1,72 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
from codepoints import *
|
||||
|
||||
name = 'MAC-CYRILLIC'
|
||||
aliases = ['x-mac-cyrillic' ]
|
||||
|
||||
language = \
|
||||
{
|
||||
'complete': [ 'bg', 'ru' ],
|
||||
'incomplete': [ 'uk', 'be' ]
|
||||
}
|
||||
|
||||
# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
|
||||
charmap = \
|
||||
[
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
|
||||
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
|
||||
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 8X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 9X
|
||||
SYM,SYM,LET,SYM,SYM,SYM,SYM,LET,SYM,SYM,SYM,LET,LET,SYM,LET,LET, # AX
|
||||
SYM,SYM,SYM,SYM,LET,SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # BX
|
||||
LET,LET,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,LET,LET,LET,LET,LET, # CX
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,LET,LET,LET,LET,SYM,LET,LET,LET, # DX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM, # FX
|
||||
]
|
||||
77
script/charsets/tis-620.py
Normal file
77
script/charsets/tis-620.py
Normal file
@ -0,0 +1,77 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
from codepoints import *
|
||||
|
||||
# TIS-620 is the Thai Industrial Standard 8-bit encoding of the Thai script.
|
||||
# It keeps ASCII in the lower half and places Thai characters within 0xA1-0xFB.
|
||||
# ISO-8859-11 is nearly identical; it only adds a no-break space at 0xA0,
|
||||
# a position which TIS-620 leaves undefined (marked ILL in the charmap below).
|
||||
|
||||
name = 'TIS-620'
|
||||
aliases = []
|
||||
|
||||
language = \
|
||||
{
|
||||
# Designed for Thai language.
|
||||
'complete': ['th'],
|
||||
'incomplete': []
|
||||
}
|
||||
|
||||
# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
|
||||
charmap = \
|
||||
[
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
|
||||
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 8X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 9X
|
||||
ILL,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # AX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # BX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX
|
||||
LET,SYM,LET,LET,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,ILL,ILL,ILL,SYM, # DX
|
||||
LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,LET, # EX
|
||||
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,LET,LET,ILL,ILL,ILL,ILL, # FX
|
||||
]
|
||||
72
script/charsets/viscii.py
Normal file
72
script/charsets/viscii.py
Normal file
@ -0,0 +1,72 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
from codepoints import *
|
||||
|
||||
name = 'VISCII'
|
||||
aliases = ['csVISCII']
|
||||
|
||||
language = \
|
||||
{
|
||||
# Dedicated to Vietnamese.
|
||||
'complete': ['vi'],
|
||||
'incomplete': []
|
||||
}
|
||||
|
||||
# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
|
||||
charmap = \
|
||||
[
|
||||
CTR,CTR,LET,CTR,CTR,LET,LET,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
|
||||
CTR,CTR,CTR,CTR,LET,CTR,CTR,CTR,CTR,LET,CTR,CTR,CTR,CTR,LET,CTR, # 1X
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
|
||||
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 8X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 9X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # AX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # BX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # DX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # FX
|
||||
]
|
||||
75
script/charsets/windows-1250.py
Normal file
75
script/charsets/windows-1250.py
Normal file
@ -0,0 +1,75 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
from codepoints import *
|
||||
|
||||
name = 'WINDOWS-1250'
|
||||
aliases = ['cswindows1250']
|
||||
|
||||
language = \
|
||||
{
|
||||
# Windows-1250 is used under Microsoft Windows to represent texts in Central European and
|
||||
# Eastern European languages that use Latin script, such as Polish, Czech,
|
||||
# Slovak, Hungarian, Slovene, Bosnian, Croatian, Serbian (Latin script),
|
||||
# Romanian (before 1993 spelling reform) and Albanian.
|
||||
'complete': [ 'pl', 'hu', 'sl', 'bs', 'hr', 'sr', 'ro', 'sq', 'de' ],
|
||||
'incomplete': []
|
||||
}
|
||||
|
||||
# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
|
||||
charmap = \
|
||||
[
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
|
||||
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
|
||||
SYM,ILL,SYM,ILL,SYM,SYM,SYM,SYM,ILL,SYM,LET,SYM,LET,LET,LET,LET, # 8X
|
||||
ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM,LET,SYM,LET,LET,LET,LET, # 9X
|
||||
SYM,SYM,SYM,LET,SYM,LET,SYM,SYM,SYM,SYM,LET,SYM,SYM,SYM,SYM,LET, # AX
|
||||
SYM,SYM,SYM,LET,SYM,SYM,SYM,SYM,SYM,LET,LET,SYM,LET,SYM,LET,LET, # BX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX
|
||||
LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,LET, # DX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX
|
||||
LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,SYM, # FX
|
||||
]
|
||||
75
script/charsets/windows-1251.py
Normal file
75
script/charsets/windows-1251.py
Normal file
@ -0,0 +1,75 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
from codepoints import *
|
||||
|
||||
name = 'WINDOWS-1251'
|
||||
aliases = ['CP-1251', 'cswindows1251']
|
||||
|
||||
language = \
|
||||
{
|
||||
# Windows-1251 is a popular 8-bit character encoding, designed to cover
|
||||
# languages that use the Cyrillic script such as Russian, Bulgarian, Serbian
|
||||
# Cyrillic and other languages. It is the most widely used for encoding the
|
||||
# Bulgarian, Serbian and Macedonian languages.
|
||||
'complete': [ 'ru', 'uk', 'be', 'bg', 'sr', 'mk' ],
|
||||
'incomplete': []
|
||||
}
|
||||
|
||||
# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
|
||||
charmap = \
|
||||
[
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
|
||||
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
|
||||
LET,LET,SYM,LET,SYM,SYM,SYM,SYM,SYM,SYM,LET,SYM,LET,LET,LET,LET, # 8X
|
||||
LET,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM,LET,SYM,LET,LET,LET,LET, # 9X
|
||||
SYM,LET,LET,LET,SYM,LET,SYM,SYM,LET,SYM,LET,SYM,SYM,SYM,SYM,LET, # AX
|
||||
SYM,SYM,LET,LET,LET,SYM,SYM,SYM,LET,SYM,LET,SYM,LET,LET,LET,LET, # BX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # DX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # FX
|
||||
]
|
||||
76
script/charsets/windows-1252.py
Normal file
76
script/charsets/windows-1252.py
Normal file
@ -0,0 +1,76 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
from codepoints import *
|
||||
|
||||
name = 'WINDOWS-1252'
|
||||
aliases = ['CP-1252', 'cswindows1252']
|
||||
|
||||
language = \
|
||||
{
|
||||
# Languages with complete coverage.
|
||||
# Basically a mix of ISO-8859-1 and ISO-8859-15.
|
||||
'complete': [ 'af', 'sq', 'eu', 'br', 'co', 'da', 'en', 'fo', 'gl', 'de',
|
||||
'is', 'id', 'it', 'ku', 'leon1250', 'lb', 'ms', 'gv', 'no',
|
||||
'oc', 'pt', 'rm', 'gd', 'es', 'sw', 'sv', 'wa', 'ca', 'et',
|
||||
'fi', 'fr', 'ga', 'la' ],
|
||||
'incomplete': []
|
||||
}
|
||||
|
||||
# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
|
||||
charmap = \
|
||||
[
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
|
||||
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
|
||||
SYM,ILL,SYM,LET,SYM,SYM,SYM,SYM,SYM,SYM,LET,SYM,LET,ILL,LET,ILL, # 8X
|
||||
ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,LET,SYM,LET,ILL,LET,LET, # 9X
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # AX
|
||||
SYM,SYM,SYM,SYM,SYM,LET,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # BX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX
|
||||
LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,LET, # DX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX
|
||||
LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,LET, # FX
|
||||
]
|
||||
72
script/charsets/windows-1253.py
Normal file
72
script/charsets/windows-1253.py
Normal file
@ -0,0 +1,72 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
from codepoints import *
|
||||
|
||||
name = 'WINDOWS-1253'
|
||||
aliases = ['cswindows1253']
|
||||
|
||||
language = \
|
||||
{
|
||||
# Greek support.
|
||||
'complete': ['el'],
|
||||
'incomplete': []
|
||||
}
|
||||
|
||||
# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
|
||||
charmap = \
|
||||
[
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
|
||||
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X
|
||||
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
|
||||
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
|
||||
SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
|
||||
SYM,ILL,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM,ILL,SYM,ILL,ILL,ILL,ILL, # 8X
|
||||
ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM,ILL,SYM,ILL,ILL,ILL,ILL, # 9X
|
||||
SYM,SYM,LET,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM,SYM,SYM,SYM,SYM, # AX
|
||||
SYM,SYM,SYM,SYM,SYM,LET,SYM,SYM,LET,LET,LET,SYM,LET,SYM,LET,LET, # BX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX
|
||||
LET,LET,ILL,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # DX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX
|
||||
LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,ILL, # FX
|
||||
]
|
||||
71
script/charsets/windows-1255.py
Normal file
71
script/charsets/windows-1255.py
Normal file
@ -0,0 +1,71 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
from codepoints import *
|
||||
|
||||
# Canonical name of the charset described by this module, and its aliases.
name = 'WINDOWS-1255'
aliases = ['CP1255']

# Language codes associated with this charset.
# NOTE(review): 'complete' / 'incomplete' presumably tell whether the charset
# covers the whole alphabet of the language -- confirm against the consumer.
language = {
    'complete': ['he'],
    'incomplete': [],
}

# One entry per byte value, 16 entries per row; the row comment gives the
# high nibble and the header below gives the low nibble.
# The category names (CTR, RET, SYM, NUM, LET) come from `codepoints`.
# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
charmap = [
  CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
  CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X
  SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
  NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
  SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
  LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
  SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
  LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
  SYM,CTR,SYM,LET,SYM,SYM,SYM,SYM,LET,SYM,CTR,SYM,CTR,CTR,CTR,CTR, # 8X
  CTR,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,CTR,SYM,CTR,CTR,CTR,CTR, # 9X
  SYM,SYM,SYM,SYM,LET,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # AX
  SYM,SYM,SYM,SYM,SYM,LET,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # BX
  SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # CX
  SYM,SYM,SYM,SYM,LET,LET,LET,SYM,SYM,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # DX
  LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX
  LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,CTR,CTR,SYM,SYM,CTR, # FX
]
|
||||
75
script/charsets/windows-1256.py
Normal file
75
script/charsets/windows-1256.py
Normal file
@ -0,0 +1,75 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
from codepoints import *
|
||||
|
||||
# Canonical name of the charset described by this module, and its aliases.
name = 'WINDOWS-1256'
aliases = ['cswindows1256']

# Language codes associated with this charset.
# NOTE(review): 'complete' / 'incomplete' presumably tell whether the charset
# covers the whole alphabet of the language -- confirm against the consumer.
language = {
    # Dedicated to Arabic (and possibly some other languages that use the
    # Arabic script, like Persian and Urdu).
    # Also contains some French characters for historical (colonial) reasons
    # (upper-case letters with diacritics were not included).
    'complete': ['ar', 'fr', 'fa', 'ur'],
    'incomplete': [],
}

# One entry per byte value, 16 entries per row; the row comment gives the
# high nibble and the header below gives the low nibble.
# The category names (CTR, RET, SYM, NUM, LET) come from `codepoints`.
# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
charmap = [
  CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
  CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X
  SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
  NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
  SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
  LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
  SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
  LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
  SYM,LET,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,LET,SYM,LET,LET,LET,LET, # 8X
  LET,SYM,SYM,SYM,SYM,SYM,SYM,SYM,LET,SYM,LET,SYM,LET,SYM,SYM,LET, # 9X
  SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,LET,SYM,SYM,SYM,SYM,SYM, # AX
  SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # BX
  LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX
  LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,LET, # DX
  LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX
  SYM,SYM,SYM,SYM,LET,SYM,SYM,SYM,SYM,LET,SYM,LET,LET,SYM,SYM,LET, # FX
]
|
||||
72
script/charsets/windows-1257.py
Normal file
72
script/charsets/windows-1257.py
Normal file
@ -0,0 +1,72 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
from codepoints import *
|
||||
|
||||
# Canonical name of the charset described by this module, and its aliases.
name = 'WINDOWS-1257'
aliases = ['CP-1257']

# Language codes associated with this charset.
# NOTE(review): 'complete' / 'incomplete' presumably tell whether the charset
# covers the whole alphabet of the language -- confirm against the consumer.
language = {
    # Designed to support the Estonian, Latvian and Lithuanian languages.
    'complete': ['et', 'lv', 'lt'],
    'incomplete': [],
}

# One entry per byte value, 16 entries per row; the row comment gives the
# high nibble and the header below gives the low nibble.
# The category names (CTR, RET, SYM, NUM, LET, ILL) come from `codepoints`.
# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
charmap = [
  CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
  CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X
  SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
  NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
  SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
  LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
  SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
  LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
  SYM,ILL,SYM,ILL,SYM,SYM,SYM,SYM,ILL,SYM,ILL,SYM,ILL,SYM,SYM,SYM, # 8X
  ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM,ILL,SYM,ILL,SYM,SYM,ILL, # 9X
  SYM,ILL,SYM,SYM,SYM,ILL,SYM,SYM,LET,SYM,LET,SYM,SYM,SYM,SYM,LET, # AX
  SYM,SYM,SYM,SYM,SYM,LET,SYM,SYM,LET,SYM,LET,SYM,SYM,SYM,SYM,LET, # BX
  LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX
  LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,LET, # DX
  LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX
  LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,SYM, # FX
]
|
||||
72
script/charsets/windows-1258.py
Normal file
72
script/charsets/windows-1258.py
Normal file
@ -0,0 +1,72 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
from codepoints import *
|
||||
|
||||
# Canonical name of the charset described by this module, and its aliases.
name = 'WINDOWS-1258'
aliases = ['cswindows1258']

# Language codes associated with this charset.
# NOTE(review): 'complete' / 'incomplete' presumably tell whether the charset
# covers the whole alphabet of the language -- confirm against the consumer.
language = {
    # Dedicated to Vietnamese.
    'complete': ['vi'],
    'incomplete': [],
}

# One entry per byte value, 16 entries per row; the row comment gives the
# high nibble and the header below gives the low nibble.
# The category names (CTR, RET, SYM, NUM, LET, ILL) come from `codepoints`.
# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
charmap = [
  CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
  CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X
  SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
  NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
  SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
  LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
  SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
  LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
  SYM,ILL,SYM,LET,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM,LET,ILL,ILL,ILL, # 8X
  ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM,LET,ILL,ILL,LET, # 9X
  SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # AX
  SYM,SYM,SYM,SYM,SYM,LET,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # BX
  LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET, # CX
  LET,LET,SYM,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,SYM,LET, # DX
  LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET, # EX
  LET,LET,SYM,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,LET, # FX
]
|
||||
152
script/create-table.py
Executable file
152
script/create-table.py
Executable file
@ -0,0 +1,152 @@
|
||||
#!/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
import optparse
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
# Path of this script relative to the current directory; shown in the usage
# text and in the header of the generated table.
script_path = os.path.relpath(__file__)

usage = 'Usage: {} <CHARSET-NAME>\n' \
        '\nEx: `{} ISO-8859-15`'.format(script_path, script_path)

description = "Internal tool to generate a charset table."
cmdline = optparse.OptionParser(usage, description=description)
(options, charset) = cmdline.parse_args()
if len(charset) != 1:
    sys.stderr.write("Please choose exactly one charset as argument.\n")
    sys.exit(1)

charset = charset[0]

# Prefer Python's own codecs. Fall back to the external `iconv` tool only
# when Python does not know the requested charset.
use_iconv = False
try:
    b' '.decode(charset)
except LookupError:
    use_iconv = True
    try:
        call = subprocess.Popen(['iconv', '--version'],
                                stdin=subprocess.DEVNULL,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
    except FileNotFoundError:
        sys.stderr.write('Error: `iconv` is not installed.\n')
        sys.exit(1)
    # Wait for `iconv --version` to finish and judge it by its exit status.
    # Polling right after spawning is racy: a fast, successful run would
    # already have exited and be mistaken for a failure.
    (dec_version, error) = call.communicate()
    if call.returncode != 0 or not dec_version:
        sys.stderr.write('Error: `iconv` ended with error "{}".\n'.format(
            error.decode('UTF-8', 'replace').strip()))
        sys.exit(1)
    # Only the first line of the version banner goes into the table header.
    dec_version = dec_version.decode('UTF-8').splitlines()[0]
else:
    dec_version = 'Python {}'.format(sys.version).splitlines()[0]
|
||||
|
||||
def get_utf8_char(bchar, charset, iconv):
    """Decode `bchar` from `charset` and return it as a `str`.

    Args:
        bchar: `bytes` object to decode (the script passes a single byte).
        charset: name of the source charset.
        iconv: if true, decode by piping `bchar` through the external
            `iconv` program; otherwise use `bytes.decode()`.

    Returns:
        The decoded text, or None when the input cannot be decoded (which
        the caller reports as an illegal code point).

    The process exits with status 1 if `iconv` is requested but is not
    installed, or if it has already terminated before receiving any input.
    """
    if not iconv:
        try:
            return bchar.decode(charset)
        except UnicodeDecodeError:
            # Typical error:
            # UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in
            # position 0: character maps to <undefined>
            # It would mean an illegal character.
            return None

    try:
        call = subprocess.Popen(['iconv', '-f', charset, '-t', 'UTF-8'],
                                stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
    except FileNotFoundError:
        sys.stderr.write('Error: `iconv` is not installed.\n')
        sys.exit(1)

    # `iconv` is expected to wait for data on stdin, so a process that has
    # already exited failed before reading anything. This check is
    # best-effort only: a failure that happens later simply produces no
    # output and is reported as None below.
    if call.poll() is not None:
        (_, error) = call.communicate()
        # stderr is captured so that the message shows what iconv reported.
        sys.stderr.write('Error: `iconv` ended with error "{}".\n'.format(
            error.decode('UTF-8', 'replace').strip()))
        sys.exit(1)

    (uchar, _) = call.communicate(input=bchar)
    if uchar:
        return uchar.decode('UTF-8')
    return None
|
||||
|
||||
# Header recording which tool and which decoder produced the table.
print('## Table generated by {} with: ##'.format(script_path))
print('## {} ##'.format(dec_version))

print('# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #')
print('charmap = \\')
sys.stdout.write('[')
for row in range(0x10):
    sys.stdout.write('\n ')

    # Decode the 16 bytes of this row once and reuse the result for both the
    # category line and the debugging comment line. With iconv every decode
    # spawns a process, so decoding twice doubles the run time.
    decoded = [get_utf8_char(bytes([row * 0x10 + col]), charset, use_iconv)
               for col in range(0x10)]

    # Category line: one category name per byte.
    has_printable = False
    for char in decoded:
        if char is None:
            sys.stdout.write('ILL,')
        elif char.isalpha():
            sys.stdout.write('LET,')
            has_printable = True
        elif char.isdigit():
            sys.stdout.write('NUM,')
            has_printable = True
        elif char == '\n' or char == '\r':
            sys.stdout.write('RET,')
        elif char.isprintable():
            sys.stdout.write('SYM,')
            has_printable = True
        else:
            sys.stdout.write('CTR,')

    # Trailing comment naming the high nibble of the row ("0X" to "FX").
    sys.stdout.write(' # {:X}X'.format(row))

    if has_printable:
        # The line has at least one printable character. Print the decoded
        # characters in a comment for debugging.
        sys.stdout.write('\n#')
        for char in decoded:
            if char is None:
                sys.stdout.write(' ILL')
            elif char == '\n' or char == '\r':
                sys.stdout.write(' RET')
            elif char.isalpha() or char.isdigit() or char.isprintable():
                sys.stdout.write(" '{}'".format(char))
            else:
                sys.stdout.write(' CTR')
sys.stdout.write('\n]')
|
||||
@ -34,5 +34,3 @@
|
||||
* the terms of any one of the MPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
#include "../nsSBCharSetProber.h"
|
||||
|
||||
59
script/langs/ar.py
Normal file
59
script/langs/ar.py
Normal file
@ -0,0 +1,59 @@
|
||||
#!/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
import re
|
||||
|
||||
## Mandatory Properties ##
|
||||
|
||||
# Human-readable language name and language code.
name = 'Arabic'
code = 'ar'
# Whether ASCII letters count as part of the language's alphabet.
use_ascii = False
# Charsets to generate data for.
charsets = ['ISO-8859-6', 'WINDOWS-1256']

## Optional Properties ##

# No alphabet. Arabic is complicated because letters have different
# forms (glyphs) depending on positions. Some charsets would encode
# glyphs while others would encode only the forms. When in doubt, just
# leave the defaults for now.

start_pages = ['الصفحة_الرئيسية']
# The Wikipedia language code is the same as the language code.
wikipedia_code = code
case_mapping = False
|
||||
58
script/langs/be.py
Normal file
58
script/langs/be.py
Normal file
@ -0,0 +1,58 @@
|
||||
#!/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
import re
|
||||
|
||||
## Mandatory Properties ##
|
||||
|
||||
# Human-readable language name and language code.
name = 'Belarusian'
code = 'be'
# Whether ASCII letters count as part of the language's alphabet.
use_ascii = False
# Charsets to generate data for.
charsets = ['WINDOWS-1251', 'ISO-8859-5']

## Optional Properties ##

# Alphabet characters.
alphabet = 'абвгдеёжзійклмнопрстуўфхцчшыьэюя'
# A starred page which was rewarded on the main page when I created
# the data.
start_pages = ['Максім_Танк']
# The Wikipedia language code is the same as the language code.
wikipedia_code = code
case_mapping = True
|
||||
58
script/langs/bg.py
Normal file
58
script/langs/bg.py
Normal file
@ -0,0 +1,58 @@
|
||||
#!/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
import re
|
||||
|
||||
## Mandatory Properties ##
|
||||
|
||||
# Human-readable language name and language code.
name = 'Bulgarian'
code = 'bg'
# Whether ASCII letters count as part of the language's alphabet.
use_ascii = False
# Charsets to generate data for.
charsets = ['WINDOWS-1251', 'ISO-8859-5']

## Optional Properties ##

# Alphabet characters.
alphabet = 'абвгдежзийклмнопрстуфхцчшщъьюя'
# A starred page which was rewarded on the main page when I created
# the data.
start_pages = ['Амурски_леопард']
# The Wikipedia language code is the same as the language code.
wikipedia_code = code
case_mapping = True
|
||||
79
script/langs/ca.py
Normal file
79
script/langs/ca.py
Normal file
@ -0,0 +1,79 @@
|
||||
#!/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
import re
|
||||
|
||||
## Mandatory Properties ##
|
||||
|
||||
# The human name for the language, in English.
name = 'Catalan'
# Use 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise,
# or use another catalog as a last resort.
code = 'ca'
# ASCII characters are also used in Catalan.
use_ascii = True
# The charsets we want to support and create data for.
charsets = ['ISO-8859-1', 'WINDOWS-1252']

## Optional Properties ##

# Alphabet characters.
# If use_ascii=True, there is no need to add any ASCII characters.
# If case_mapping=True, there is no need to add several cases of the same
# character (provided Python algorithms know the right cases).
alphabet = ['à', 'è', 'é', 'í', 'ï', 'ó', 'ò', 'ú', 'ü', 'ç']
# The start pages. Though optional, it is advised to choose them yourself.
start_pages = ['Parlament_Europeu', 'Genji Monogatari']
# Gives the possibility to select another code for the Wikipedia URL.
wikipedia_code = code
# 'a' and 'A' will be considered the same character, and so on.
# This uses Python's algorithm to determine the upper/lower case of a given
# character.
case_mapping = True
|
||||
|
||||
# A function to clean content returned by the `wikipedia` python lib,
|
||||
# in case some unwanted data has been overlooked.
|
||||
# Note that we are already cleaning away the '=' from the title syntax
|
||||
# of Wikipedia, as well as double spaces. But sometimes, Wikipedia in
|
||||
# some language may return weird syntax or UI text which should be
|
||||
# discarded. If you encounter one of these cases, use this function.
|
||||
def clean_wikipedia_content(content):
    """Return `content` after language-specific cleanup.

    This is currently a no-op: the text is returned unchanged. Add any
    garbage-text cleaning here.
    """
    cleaned = content
    return cleaned
|
||||
80
script/langs/cs.py
Normal file
80
script/langs/cs.py
Normal file
@ -0,0 +1,80 @@
|
||||
#!/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
import re
|
||||
|
||||
## Mandatory Properties ##
|
||||
|
||||
# The human name for the language, in English.
|
||||
name = 'Czech'
|
||||
# Use 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise,
|
||||
# or use another catalog as a last resort.
|
||||
code = 'cs'
|
||||
# ASCII characters are also used in Czech.
|
||||
use_ascii = True
|
||||
# The charsets we want to support and create data for.
|
||||
charsets = ['ISO-8859-2', 'Windows-1250', 'IBM852', 'MAC-CENTRALEUROPE']
|
||||
|
||||
## Optional Properties ##
|
||||
|
||||
# Alphabet characters.
|
||||
# If use_ascii=True, there is no need to add any ASCII characters.
|
||||
# If case_mapping=True, there is no need to add several cases of a same
|
||||
# character (provided Python algorithms know the right cases).
|
||||
alphabet = 'áčďéěíňóřšťúůýž'
|
||||
# The featured page which was highlighted on the main page when I created
|
||||
# the data.
|
||||
start_pages = ['Sociální fobie']
|
||||
# Makes it possible to select another code for the Wikipedia URL.
|
||||
wikipedia_code = code
|
||||
# 'a' and 'A' will be considered the same character, and so on.
|
||||
# This uses Python algorithm to determine upper/lower-case of a given
|
||||
# character.
|
||||
case_mapping = True
|
||||
|
||||
# A function to clean content returned by the `wikipedia` python lib,
|
||||
# in case some unwanted data has been overlooked.
|
||||
# Note that we are already cleaning away the '=' from the title syntax
|
||||
# of Wikipedia, as well as double spaces. But sometimes, Wikipedia in
|
||||
# some language may return weird syntax or UI text which should be
|
||||
# discarded. If you encounter one of these cases, use this function.
|
||||
def clean_wikipedia_content(content):
|
||||
# Do your garbage text cleaning here. Nothing is cleaned for now:
# `content` is returned unchanged.
|
||||
return content
|
||||
69
script/langs/da.py
Normal file
69
script/langs/da.py
Normal file
@ -0,0 +1,69 @@
|
||||
#!/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
import re
|
||||
|
||||
## Mandatory Properties ##
|
||||
|
||||
# The human name for the language, in English.
|
||||
name = 'Danish'
|
||||
# Use 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise,
|
||||
# or use another catalog as a last resort.
|
||||
code = 'da'
|
||||
# ASCII characters are also used in Danish.
|
||||
use_ascii = True
|
||||
# The charsets we want to support and create data for.
|
||||
charsets = ['ISO-8859-15', 'ISO-8859-1', 'WINDOWS-1252', 'IBM865' ]
|
||||
|
||||
## Optional Properties ##
|
||||
|
||||
# Alphabet characters.
|
||||
# If use_ascii=True, there is no need to add any ASCII characters.
|
||||
# If case_mapping=True, there is no need to add several cases of a same
|
||||
# character (provided Python algorithms know the right cases).
|
||||
alphabet = 'æøå'
|
||||
# The start page. Though optional, it is advised to choose one yourself.
|
||||
start_pages = ['Forside']
|
||||
# Makes it possible to select another code for the Wikipedia URL.
|
||||
wikipedia_code = code
|
||||
# 'a' and 'A' will be considered the same character, and so on.
|
||||
# This uses Python algorithm to determine upper/lower-case of a given
|
||||
# character.
|
||||
case_mapping = True
|
||||
69
script/langs/de.py
Normal file
69
script/langs/de.py
Normal file
@ -0,0 +1,69 @@
|
||||
#!/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
import re
|
||||
|
||||
## Mandatory Properties ##
|
||||
|
||||
# The human name for the language, in English.
|
||||
name = 'German'
|
||||
# Use 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise,
|
||||
# or use another catalog as a last resort.
|
||||
code = 'de'
|
||||
# ASCII characters are also used in German.
|
||||
use_ascii = True
|
||||
# The charsets we want to support and create data for.
|
||||
charsets = ['ISO-8859-1', 'WINDOWS-1252']
|
||||
|
||||
## Optional Properties ##
|
||||
|
||||
# Alphabet characters.
|
||||
# If use_ascii=True, there is no need to add any ASCII characters.
|
||||
# If case_mapping=True, there is no need to add several cases of a same
|
||||
# character (provided Python algorithms know the right cases).
|
||||
alphabet = ['ä', 'ö', 'ü', 'ß']
|
||||
# The start page. Though optional, it is advised to choose one yourself.
|
||||
start_pages = ['Deutschland']
|
||||
# Makes it possible to select another code for the Wikipedia URL.
|
||||
wikipedia_code = code
|
||||
# 'a' and 'A' will be considered the same character, and so on.
|
||||
# This uses Python algorithm to determine upper/lower-case of a given
|
||||
# character.
|
||||
case_mapping = True
|
||||
55
script/langs/el.py
Normal file
55
script/langs/el.py
Normal file
@ -0,0 +1,55 @@
|
||||
#!/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
import re
|
||||
|
||||
## Mandatory Properties ##
|
||||
|
||||
name = 'Greek'
|
||||
code = 'el'
|
||||
use_ascii = False
|
||||
charsets = ['ISO-8859-7', 'WINDOWS-1253', 'CP737']
|
||||
|
||||
## Optional Properties ##
|
||||
|
||||
alphabet = 'αβγδεζηθικλμνξοπρσςτυφχψωάέήίόύώ'
|
||||
start_pages = ['Πρωτεύοντα']
|
||||
wikipedia_code = code
|
||||
case_mapping = True
|
||||
64
script/langs/en.py
Normal file
64
script/langs/en.py
Normal file
@ -0,0 +1,64 @@
|
||||
#!/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
import re
|
||||
|
||||
## Mandatory Properties ##
|
||||
|
||||
# The human name for the language, in English.
|
||||
name = 'English'
|
||||
# Use 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise,
|
||||
# or use another catalog as a last resort.
|
||||
code = 'en'
|
||||
# English is written with ASCII characters.
|
||||
use_ascii = True
|
||||
# The charsets we want to support and create data for.
|
||||
charsets = ['ISO-8859-1', 'WINDOWS-1252']
|
||||
|
||||
## Optional Properties ##
|
||||
|
||||
# The start page. Though optional, it is advised to choose one yourself.
|
||||
start_pages = ['Marmot']
|
||||
# Makes it possible to select another code for the Wikipedia URL.
|
||||
wikipedia_code = code
|
||||
# 'a' and 'A' will be considered the same character, and so on.
|
||||
# This uses Python algorithm to determine upper/lower-case of a given
|
||||
# character.
|
||||
case_mapping = True
|
||||
67
script/langs/eo.py
Normal file
67
script/langs/eo.py
Normal file
@ -0,0 +1,67 @@
|
||||
#!/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
import re
|
||||
|
||||
## Mandatory Properties ##
|
||||
|
||||
# The human name for the language, in English.
|
||||
name = 'Esperanto'
|
||||
# Use 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise,
|
||||
# or use another catalog as a last resort.
|
||||
code = 'eo'
|
||||
# Esperanto actually does use ASCII, but not q, w, x, or y.
|
||||
# So I just use the alphabet variable below instead.
|
||||
use_ascii = False
|
||||
# The charsets we want to support and create data for.
|
||||
charsets = ['ISO-8859-3']
|
||||
|
||||
## Optional Properties ##
|
||||
|
||||
# Alphabet characters.
|
||||
alphabet = 'abcĉdefgĝhĥijĵklmnoprsŝtuŭvz'
|
||||
# The start page. Though optional, it is advised to choose one yourself.
|
||||
start_pages = ['Vikipedio:Ĉefpaĝo']
|
||||
# Makes it possible to select another code for the Wikipedia URL.
|
||||
wikipedia_code = code
|
||||
# 'a' and 'A' will be considered the same character, and so on.
|
||||
# This uses Python algorithm to determine upper/lower-case of a given
|
||||
# character.
|
||||
case_mapping = True
|
||||
69
script/langs/es.py
Normal file
69
script/langs/es.py
Normal file
@ -0,0 +1,69 @@
|
||||
#!/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
import re
|
||||
|
||||
## Mandatory Properties ##
|
||||
|
||||
# The human name for the language, in English.
|
||||
name = 'Spanish'
|
||||
# Use 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise,
|
||||
# or use another catalog as a last resort.
|
||||
code = 'es'
|
||||
# ASCII characters are also used in Spanish.
|
||||
use_ascii = True
|
||||
# The charsets we want to support and create data for.
|
||||
charsets = ['ISO-8859-15', 'ISO-8859-1', 'WINDOWS-1252']
|
||||
|
||||
## Optional Properties ##
|
||||
|
||||
# Alphabet characters.
|
||||
# If use_ascii=True, there is no need to add any ASCII characters.
|
||||
# If case_mapping=True, there is no need to add several cases of a same
|
||||
# character (provided Python algorithms know the right cases).
|
||||
alphabet = 'ñáéíóúü'
|
||||
# The start page. Though optional, it is advised to choose one yourself.
|
||||
start_pages = ['España']
|
||||
# Makes it possible to select another code for the Wikipedia URL.
|
||||
wikipedia_code = code
|
||||
# 'a' and 'A' will be considered the same character, and so on.
|
||||
# This uses Python algorithm to determine upper/lower-case of a given
|
||||
# character.
|
||||
case_mapping = True
|
||||
57
script/langs/et.py
Normal file
57
script/langs/et.py
Normal file
@ -0,0 +1,57 @@
|
||||
#!/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
import re
|
||||
|
||||
## Mandatory Properties ##
|
||||
|
||||
name = 'Estonian'
|
||||
code = 'et'
|
||||
use_ascii = True
|
||||
charsets = ['ISO-8859-4', 'ISO-8859-13', 'ISO-8859-15',
|
||||
'WINDOWS-1252', 'WINDOWS-1257']
|
||||
|
||||
## Optional Properties ##
|
||||
|
||||
# Alphabet characters.
|
||||
alphabet = 'äöüõšž'
|
||||
start_pages = ['Harilik pohl']
|
||||
wikipedia_code = code
|
||||
case_mapping = True
|
||||
60
script/langs/fi.py
Normal file
60
script/langs/fi.py
Normal file
@ -0,0 +1,60 @@
|
||||
#!/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ##### BEGIN LICENSE BLOCK #####
|
||||
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
||||
#
|
||||
# The contents of this file are subject to the Mozilla Public License Version
|
||||
# 1.1 (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
# http://www.mozilla.org/MPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing rights and limitations under the
|
||||
# License.
|
||||
#
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Jehan <jehan@girinstud.io>
|
||||
#
|
||||
# Alternatively, the contents of this file may be used under the terms of
|
||||
# either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
# in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
# of those above. If you wish to allow use of your version of this file only
|
||||
# under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
# use your version of this file under the terms of the MPL, indicate your
|
||||
# decision by deleting the provisions above and replace them with the notice
|
||||
# and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
# the provisions above, a recipient may use your version of this file under
|
||||
# the terms of any one of the MPL, the GPL or the LGPL.
|
||||
#
|
||||
# ##### END LICENSE BLOCK #####
|
||||
|
||||
import re
|
||||
|
||||
## Mandatory Properties ##
|
||||
|
||||
name = 'Finnish'
|
||||
code = 'fi'
|
||||
use_ascii = True
|
||||
charsets = ['ISO-8859-1', 'ISO-8859-4', 'ISO-8859-9',
|
||||
'ISO-8859-13', 'ISO-8859-15', 'WINDOWS-1252']
|
||||
|
||||
## Optional Properties ##
|
||||
|
||||
# Alphabet characters.
|
||||
# 'å' (Swedish o), 'š' and 'ž' are rare enough that I don't want to include them
|
||||
# here.
|
||||
alphabet = 'äö'
|
||||
# Some random high quality page found on the Finnish home page.
|
||||
start_pages = ['Yhdistynyt kuningaskunta']
|
||||
wikipedia_code = code
|
||||
case_mapping = True
|
||||
@ -50,7 +50,7 @@ code = 'fr'
|
||||
# ASCII characters are also used in French.
|
||||
use_ascii = True
|
||||
# The charsets we want to support and create data for.
|
||||
charsets = ['ISO-8859-15', 'ISO-8859-1']
|
||||
charsets = ['ISO-8859-15', 'ISO-8859-1', 'WINDOWS-1252']
|
||||
|
||||
## Optional Properties ##
|
||||
|
||||
@ -70,9 +70,10 @@ case_mapping = True
|
||||
|
||||
# A function to clean content returned by the `wikipedia` python lib,
|
||||
# in case some unwanted data has been overlooked.
|
||||
# Note that we are already cleaning away the '=' from the title syntax
|
||||
# of Wikipedia, as well as double spaces. But sometimes, Wikipedia in
|
||||
# some language may return weird syntax or UI text which should be
|
||||
# discarded. If you encounter one of these cases, use this function.
|
||||
def clean_wikipedia_content(content):
|
||||
# We get modify link in the text: "=== Articles connexesModifier ==="
|
||||
cleaned = re.sub(r'(=+) *([^=]+) *Modifier \1',
|
||||
r'\2',
|
||||
content)
|
||||
return cleaned
|
||||
# Do your garbage text cleaning here.
|
||||
return content
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user