mirror of
https://git.minetest.land/VoxeLibre/VoxeLibre.git
synced 2024-12-23 00:19:32 +01:00
add fuzzy matching to translation updater
This commit is contained in:
parent
62ac7fd4b9
commit
7fcb54806f
1 changed files with 43 additions and 2 deletions
|
@ -13,6 +13,13 @@ from sys import argv as _argv
|
||||||
from sys import stderr as _stderr
|
from sys import stderr as _stderr
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
|
||||||
|
# Optional jellyfish for fuzzy matching
|
||||||
|
has_jellyfish = False
|
||||||
|
try:
|
||||||
|
import jellyfish
|
||||||
|
has_jellyfish = True
|
||||||
|
except: pass
|
||||||
|
|
||||||
# Running params
|
# Running params
|
||||||
params = {"recursive": False,
|
params = {"recursive": False,
|
||||||
"help": False,
|
"help": False,
|
||||||
|
@ -23,6 +30,7 @@ params = {"recursive": False,
|
||||||
"print-source": False,
|
"print-source": False,
|
||||||
"truncate-unused": False,
|
"truncate-unused": False,
|
||||||
"dofile-order": False,
|
"dofile-order": False,
|
||||||
|
"jellyfish": False,
|
||||||
}
|
}
|
||||||
# Available CLI options
|
# Available CLI options
|
||||||
options = {"recursive": ['--recursive', '-r'],
|
options = {"recursive": ['--recursive', '-r'],
|
||||||
|
@ -32,7 +40,8 @@ options = {"recursive": ['--recursive', '-r'],
|
||||||
"break-long-lines": ['--break-long-lines', '-b'],
|
"break-long-lines": ['--break-long-lines', '-b'],
|
||||||
"print-source": ['--print-source', '-p'],
|
"print-source": ['--print-source', '-p'],
|
||||||
"truncate-unused": ['--truncate-unused', '-t'],
|
"truncate-unused": ['--truncate-unused', '-t'],
|
||||||
"dofile-order": ['--dofile-order', '-d']
|
"dofile-order": ['--dofile-order', '-d'],
|
||||||
|
"jellyfish": ['--jellyfish', '-j'],
|
||||||
}
|
}
|
||||||
|
|
||||||
# Strings longer than this will have extra space added between
|
# Strings longer than this will have extra space added between
|
||||||
|
@ -89,6 +98,8 @@ DESCRIPTION
|
||||||
delete unused strings from files
|
delete unused strings from files
|
||||||
{', '.join(options["dofile-order"])}
|
{', '.join(options["dofile-order"])}
|
||||||
try to order files by their order from init.lua (not recursive)
|
try to order files by their order from init.lua (not recursive)
|
||||||
|
{', '.join(options["jellyfish"])}
|
||||||
|
use jellyfish library for fuzzy matching
|
||||||
''')
|
''')
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
@ -223,6 +234,18 @@ def get_existing_tr_files(folder):
|
||||||
out.append(name)
|
out.append(name)
|
||||||
return out
|
return out
|
||||||
|
|
||||||
|
def fuzzy_match(s, candidates):
|
||||||
|
import math
|
||||||
|
if not has_jellyfish: raise "The jellyfish library is not installed."
|
||||||
|
if len(candidates) == 0 or len(s) < 5: return None
|
||||||
|
scores = sorted((jellyfish.damerau_levenshtein_distance(s, c) / max(len(s), len(c)), c) for c in candidates)
|
||||||
|
thresh = 0.2 if len(s) >= 16 else 0.8 / math.sqrt(len(s))
|
||||||
|
if scores[0][0] > thresh: return None
|
||||||
|
if len(scores) > 1 and scores[1][0] <= thresh:
|
||||||
|
print("Ambiguous fuzzy match:", s, "<=>", scores[0][1], "<=>", scores[1][1])
|
||||||
|
return None # ambiguous
|
||||||
|
return scores[0][1]
|
||||||
|
|
||||||
# Converts the template dictionary to a text to be written as a file
|
# Converts the template dictionary to a text to be written as a file
|
||||||
# dGroupedKeyStrings is a dictionary of source file sets to localized strings
|
# dGroupedKeyStrings is a dictionary of source file sets to localized strings
|
||||||
# dOld is a dictionary of existing translations and comments from
|
# dOld is a dictionary of existing translations and comments from
|
||||||
|
@ -240,6 +263,9 @@ def strings_to_text(dGroupedKeyStrings, dOld, mod_name, header_comments, textdom
|
||||||
if header_comments is not None:
|
if header_comments is not None:
|
||||||
lOut.append(header_comments)
|
lOut.append(header_comments)
|
||||||
|
|
||||||
|
dOldStrings = set(dOld.keys())
|
||||||
|
usedFuzzy = set()
|
||||||
|
|
||||||
for source, localizedStrings in dGroupedKeyStrings.items():
|
for source, localizedStrings in dGroupedKeyStrings.items():
|
||||||
if params["print-source"] and len(source) > 0:
|
if params["print-source"] and len(source) > 0:
|
||||||
lOut.append(symbol_source_prefix + " ".join(x.replace("r\\","/") for x in source) + symbol_source_suffix)
|
lOut.append(symbol_source_prefix + " ".join(x.replace("r\\","/") for x in source) + symbol_source_suffix)
|
||||||
|
@ -251,6 +277,21 @@ def strings_to_text(dGroupedKeyStrings, dOld, mod_name, header_comments, textdom
|
||||||
if templ:
|
if templ:
|
||||||
templ_val = templ[0].get(localizedString, {})
|
templ_val = templ[0].get(localizedString, {})
|
||||||
templ_comment = templ_val.get("comment")
|
templ_comment = templ_val.get("comment")
|
||||||
|
|
||||||
|
# fuzzy matching:
|
||||||
|
if translation == "" and params["jellyfish"] and localizedString not in dOldStrings:
|
||||||
|
cand = fuzzy_match(localizedString, dOldStrings)
|
||||||
|
if cand and cand in dOld:
|
||||||
|
val = dOld.get(cand)
|
||||||
|
translation = val.get("translation", "")
|
||||||
|
if translation != "":
|
||||||
|
usedFuzzy.add(cand)
|
||||||
|
comment = val.get("comment")
|
||||||
|
if not comment or comment == "":
|
||||||
|
comment = "##TODO: fuzzy matched - verify and remove the comment"
|
||||||
|
else:
|
||||||
|
comment = comment + " ##TODO: fuzzy matched - verify and remove the comment"
|
||||||
|
|
||||||
if params["break-long-lines"] and len(localizedString) > doublespace_threshold and not lOut[-1] == "":
|
if params["break-long-lines"] and len(localizedString) > doublespace_threshold and not lOut[-1] == "":
|
||||||
lOut.append("")
|
lOut.append("")
|
||||||
if templ_comment != None and templ_comment != "" and (comment is None or comment == "" or not comment.startswith(templ_comment)):
|
if templ_comment != None and templ_comment != "" and (comment is None or comment == "" or not comment.startswith(templ_comment)):
|
||||||
|
@ -267,7 +308,7 @@ def strings_to_text(dGroupedKeyStrings, dOld, mod_name, header_comments, textdom
|
||||||
unusedExist = False
|
unusedExist = False
|
||||||
if not params["truncate-unused"]:
|
if not params["truncate-unused"]:
|
||||||
for key in dOld:
|
for key in dOld:
|
||||||
if key not in dkeyStrings:
|
if key not in dkeyStrings and not key in usedFuzzy:
|
||||||
val = dOld[key]
|
val = dOld[key]
|
||||||
translation = val.get("translation")
|
translation = val.get("translation")
|
||||||
comment = val.get("comment")
|
comment = val.get("comment")
|
||||||
|
|
Loading…
Reference in a new issue