ScreenTranslator/share/updates/tessdata.py

69 lines
2.0 KiB
Python
Raw Permalink Normal View History

2020-03-15 18:10:26 +07:00
import sys
import os
import subprocess
2020-03-18 02:15:40 +07:00
import re
def parse_language_names(root=None):
    """Build a lookup table from the project's ``src/languagecodes.cpp``.

    Every line of the file that contains exactly six double-quoted strings
    contributes one entry: the fourth quoted string becomes the key and the
    sixth becomes the value (presumably a language code and its display
    name -- confirm against ``languagecodes.cpp``).  Lines with any other
    number of quoted strings are ignored.

    Args:
        root: Directory that contains ``src/languagecodes.cpp``.  When
            omitted, the directory two levels above the one holding this
            script is used.

    Returns:
        dict mapping the fourth quoted string of each matching line to the
        sixth one.  Later lines win when a key repeats.

    Raises:
        OSError: if ``languagecodes.cpp`` cannot be opened.
    """
    if root is None:
        # Anchor on the script's own location rather than on the current
        # working directory, so the lookup works wherever it is run from.
        script_dir = os.path.dirname(os.path.abspath(__file__))
        root = os.path.abspath(os.path.join(script_dir, '..', '..'))

    quoted = re.compile(r'"(.*?)"')
    result = {}
    source_path = os.path.join(root, 'src', 'languagecodes.cpp')
    # Explicit encoding: do not depend on the platform's locale default.
    with open(source_path, 'r', encoding='utf-8') as f:
        for line in f:
            strings = quoted.findall(line)
            if len(strings) != 6:
                continue
            result[strings[3]] = strings[5]
    return result
2020-03-15 18:10:26 +07:00
# Usage: tessdata.py <tessdata_dir> [<download_url>]
#
# Scans <tessdata_dir>, groups its files by the part of the file name before
# the first '.', and prints a '"recognizers"' fragment that lists, for every
# group, the download URLs, install path, last commit date and size of each
# file.  Groups whose key is missing from parse_language_names() keep their
# raw key and are reported at the end as 'unknown names'.
if len(sys.argv) < 2:
    print("Usage:", sys.argv[0], "<tessdata_dir> [<download_url>]")
    # sys.exit instead of the site-provided exit() helper, which is not
    # guaranteed to exist (e.g. under `python -S`).
    sys.exit(1)

tessdata_dir = sys.argv[1]

download_url = "https://github.com/tesseract-ocr/tessdata_best/raw/master"
if len(sys.argv) > 2:
    download_url = sys.argv[2]

# Secondary URL emitted next to the primary one; an empty string disables it.
mirror_url = "https://translator.gres.biz/resources/tessdata_best"

language_names = parse_language_names()

# Group file names by the text before their first '.'.
files = {}
with os.scandir(tessdata_dir) as entries:  # closes the directory handle
    for entry in entries:
        if not entry.is_file() or entry.name in ["LICENSE", "README.md"]:
            continue
        # partition() never raises: a name without '.' is used whole, so it
        # shows up under 'unknown names' instead of aborting the run.
        name = entry.name.partition('.')[0]
        if not name:
            # Name starts with '.', nothing to group by.
            continue
        files.setdefault(name, []).append(entry.name)

print(',"recognizers": {')
comma = ''
unknown_names = []
for name in sorted(files):
    # Sorted so the output does not depend on directory iteration order.
    file_names = sorted(files[name])
    if name not in language_names:
        unknown_names.append(name)
    else:
        name = language_names[name]
    print(' {}"{}":{{"files":['.format(comma, name))
    comma = ', '
    # Separator between entries of one "files" array: empty before the first
    # entry, so single-file groups print exactly as before.
    file_comma = ''
    for file_name in file_names:
        # '--' keeps git from reading an odd file name as an option/revision.
        git_cmd = ['git', 'log', '-1', '--pretty=format:%cI', '--', file_name]
        date = subprocess.run(git_cmd, cwd=tessdata_dir, universal_newlines=True,
                              stdout=subprocess.PIPE, check=True).stdout
        size = os.path.getsize(os.path.join(tessdata_dir, file_name))
        mirror = ',"' + mirror_url + '/' + file_name + '.zip"' if mirror_url else ''
        print(' {}{{"url":["{}/{}"{}], "path":"$tessdata$/{}", "date":"{}", "size":{}}}'.format(
            file_comma, download_url, file_name, mirror, file_name, date, size))
        file_comma = ', '
    print(' ]}')
print('}')
print('unknown names', unknown_names)