# autotranslate.py

# A quick-and-dirty script to run untranslated text through Google Translate's API.
# The result will likely include comical errors that a native speaker will laugh at
# or be puzzled by, and some manual correction of escape codes such as @1 and @= may be
# required, but hopefully it will serve as a starting point for something useful.

# Copyright (C) 2020 FaceDeer
# LGPLv2.1+

# See https://github.com/minetest-tools/update_translations for
# potential future updates to this script.

from googletrans import Translator, LANGUAGES
import os, re, shutil

# Matches any name that ends in ".tr"
pattern_tr_filename = re.compile(r'\.tr$')
# Captures the language ID from a name of the form "<anything>.<lang_id>.tr"
# (group 1 is the text between the last two dots)
pattern_tr_id = re.compile(r'\.([^.]*)\.tr$')
# Matches a line that does not start with "#" and ends in an "=" that is not
# preceded by "@". Group 1 is everything before that final "=".
# Note that group 1 must be at least two characters long for a line to match.
pattern_line_to_translate = re.compile(r'^([^#].*[^@])=$') #finds lines that don't have a translation

# Single googletrans client, created once and used for every request
translator = Translator()

def translate(tr_filename):
    """Fill in the untranslated entries of a .tr file using Google Translate.

    The language ID is taken from the file name ("<name>.<lang_id>.tr").
    Every line matched by pattern_line_to_translate (an entry ending in a
    bare "=") is split at its "@n" markers, the distinct pieces are sent to
    Google in a single request, and the file is rewritten with the
    translation appended to each such line. Each auto-translated entry is
    preceded by a warning comment line. An entry with any piece that has no
    translation available is left untranslated so it can be retried later.

    Prints a message and returns without touching the file when the language
    ID cannot be found in the file name or is not in LANGUAGES. Does nothing
    when no line needs translating.
    """
    lang_id_match = pattern_tr_id.search(tr_filename)
    if not lang_id_match:
        print("Could not find language ID in tr filename " + tr_filename)
        return

    lang_id = lang_id_match.group(1)

    if lang_id not in LANGUAGES:
        print("language ID " + lang_id + " is not supported by Google Translate's API")
        return

    # Read the whole file up front so that the handle is closed before the
    # file gets replaced below (replacing an open file fails on Windows).
    with open(tr_filename, "r", encoding="utf-8") as tr_file_handle:
        lines = tr_file_handle.readlines()

    # Collect the strings that will be sent to Google for translation.
    # Lines are broken up at @n markers; this is not ideal for Google as it
    # may remove some context, but it's necessary to allow the @n markers to
    # be preserved in the output later. A set removes duplicates.
    pieces = set()
    for line in lines:
        line_lacking_translation = pattern_line_to_translate.search(line)
        if line_lacking_translation:
            pieces.update(line_lacking_translation.group(1).split("@n"))

    # The empty string is a common artefact of splitting, don't send it
    pieces.discard("")
    lines_to_translate = list(pieces)

    # Only do more work if there are lines in need of translation
    if not lines_to_translate:
        return

    print("Calling Google API for " + tr_filename)
    output = translator.translate(lines_to_translate, src="en", dest=lang_id)

    # Convert the output translations into a dictionary for easy substitution later.
    # Google's API sometimes seems to fail to translate a line for no apparent reason.
    # Don't put those in the dictionary, we can leave them untranslated and maybe try again.
    translation_dictionary = {}
    for out_line in output:
        if out_line.origin != out_line.text:
            translation_dictionary[out_line.origin] = out_line.text

    translation_dictionary["@n"] = "@n" # These are to be left unchanged
    # Splitting on @n yields empty pieces when an entry starts or ends with
    # @n or contains @n@n. They were never sent to Google, so map them to
    # themselves instead of treating the whole entry as untranslatable.
    translation_dictionary[""] = ""

    temp_filename = tr_filename + ".temp"
    with open(temp_filename, "w", encoding="utf-8") as tr_file_new:
        for line in lines:
            if not pattern_line_to_translate.search(line):
                # Comments, blank lines and already-translated entries are copied as-is
                tr_file_new.write(line)
                continue

            # Remove trailing newline so we can add the translated string to the same line
            line = line.rstrip('\n')

            # After splitting the line up on @n again, as was done before, we should have
            # line segments that match the strings that were sent to Google.
            # The slice leaves off the "=" that's the last character of the line.
            translated_pieces = []
            for line_piece in re.split("(@n)", line[:-1]):
                if line_piece not in translation_dictionary:
                    print("Google returned string unchanged in file " + tr_filename + ":")
                    print(line_piece)
                    translated_pieces = None
                    break
                translated_pieces.append(translation_dictionary[line_piece])

            translated_line = "".join(translated_pieces) if translated_pieces is not None else None

            if translated_line:
                tr_file_new.write("#WARNING: AUTOTRANSLATED BY GOOGLE TRANSLATE\n")
                tr_file_new.write(line)
                tr_file_new.write(translated_line)
                tr_file_new.write("\n")
            else:
                tr_file_new.write(line)
                tr_file_new.write("\n")

    # Overwrite the original file with the new one (both handles are closed by now)
    shutil.move(temp_filename, tr_filename)

# Captures the text domain (group 1) from a line of the form "# textdomain: <domain>"
pattern_domain = re.compile(r'^# textdomain: (.+)$')

def create_tr_files_from_template(folder, lang_id):
    """Create "<textdomain>.<lang_id>.tr" files from template.txt files.

    Walks the tree under folder. For every template.txt located directly in
    folder or in any directory named "locale", the text domain is read from
    the template's first line ("# textdomain: <domain>") and the template is
    copied to "<domain>.<lang_id>.tr" in the same directory, unless that file
    already exists. Templates whose first line does not match
    pattern_domain are skipped.

    Prints a message and returns without doing anything when lang_id is not
    in LANGUAGES.
    """
    if lang_id not in LANGUAGES:
        print("language ID " + lang_id + " is not supported by Google Translate's API")
        return

    for root, _dirs, files in os.walk(folder):
        # Only the starting folder itself and "locale" directories are of
        # interest. Compare against folder rather than a hard-coded "." so
        # that the top-level check also works for other starting folders.
        if root != folder and os.path.split(root)[1] != "locale":
            continue
        if "template.txt" not in files:
            continue

        template_filename = os.path.join(root, "template.txt")
        with open(template_filename, "r", encoding="utf-8") as template_file:
            first_line = template_file.readline()

        domain = pattern_domain.search(first_line)
        if not domain:
            continue

        translation_filename = os.path.join(root, domain.group(1) + "." + lang_id + ".tr")
        if os.path.isfile(translation_filename):
            # Never overwrite an existing translation file
            print(translation_filename + " already exists")
        else:
            print("Copying template.txt to " + translation_filename)
            shutil.copy(template_filename, translation_filename)

def get_existing_tr_files(folder):
    """Return the paths of all files below folder whose name matches pattern_tr_filename.

    The whole directory tree under folder is searched. Each entry of the
    returned list is the containing directory joined with the file name.
    """
    return [
        os.path.join(dirpath, filename)
        for dirpath, _subdirs, filenames in os.walk(folder)
        for filename in filenames
        if pattern_tr_filename.search(filename)
    ]

# Uncomment (and adjust the language ID) to first create .tr files from
# template.txt for the given language, where no such .tr file exists yet:
#create_tr_files_from_template(".", "de")
#create_tr_files_from_template(".", "it")

# Run every .tr file found below the current working directory through translate()
tr_files = get_existing_tr_files(".")
for tr_file in tr_files:
    translate(tr_file)