替换掉tex文件的关键字以便于翻译

发布时间 2023-05-27 15:36:54作者: Isakovsky

遇到的问题:找到了一篇文献的TeX文件,想用翻译软件把它翻译成中文以便于阅读,但机器翻译会把不该翻译的内容(例如TeX命令)也一并翻译掉,比如

\begin{document}

给翻译成了

\开始{文档}

因此,尝试使用正则表达式,先将TeX中没有必要翻译的关键字与公式替换成带编号的占位符并保存原文,翻译完成后再把占位符替换回原来的内容

(1)将关键字替换掉

# Hide the TeX keywords and formulas behind numbered placeholders and save the
# originals, so that translation software cannot alter them.
import os
import re

# Single-token patterns, applied in list order: every match is hidden on its
# own behind one placeholder.
replaceDict = [
    r'(\$\$[^\$]+\$\$)',  # math expression enclosed in a pair of $$
    r'(\$[^\$]+\$)',  # math expression enclosed in a pair of $
    r'(\\[a-zA-Z]+{[^{}]+})',  # command of the form \xxx{xxx}
    r'(\\[a-zA-Z]+)',  # command of the form \xxx
]
# Delimiter patterns, applied in list order: consecutive matches are paired up
# and each pair is hidden together with the text between the two delimiters.
expressDict = [
    r'(\\begin{equation}|\\end{equation})',
    r'(\\begin{equation\*}|\\end{equation\*})',
    r'(\\begin{align}|\\end{align})',
    r'(\\begin{align\*}|\\end{align\*})',
    r'(\\begin{table}|\\end{table})',
    r'(\\begin{table\*}|\\end{table\*})',
    r'(\\\[|\\])',
    # NOTE(review): "[|]" is a character class, so this pattern matches a
    # literal "|". If plain square brackets were intended, the pattern would
    # be r'(\[|])'. Left unchanged -- confirm the intent.
    r'([|])',
    r'({|})',
]
# Note: without the r prefix a backslash is escaped once by the Python
# interpreter and then once more by the regular expression engine.
# Note: a right square bracket does not need to be escaped.
# Note: braces do not need to be escaped when their content holds no digits.
# Gripe: the readability and maintainability of regular expressions is zero.
replaceChar = 'の'  # marks a hidden keyword; should not occur in the source text and should be left alone by the translator
splitChar = 'ん'  # separates the saved keywords inside the temporary file
inputFileName = 'main.tex'
outputFileName = 'out.tex'
tempFileName = 'temp.txt'

# Read the input inside a context manager so the handle is always closed.
with open(inputFileName, 'r', encoding='utf-8') as fin:
    textContent = fin.read()

# replacedContent[i] is the original text hidden behind placeholder number i.
replacedContent = []


def _stash(original):
    """Remember *original* and return the placeholder that stands in for it."""
    replacedContent.append(original)
    return replaceChar + str(len(replacedContent) - 1) + replaceChar


def _stash_and_report(match):
    """Callback for re.sub: echo the matched keyword, then hide it."""
    print(match.group(0))
    return _stash(match.group(0))


# Hide the preamble: everything up to and including the first
# \begin{document}. A file without \begin{document} (for example a chapter
# that is \input elsewhere) has no preamble, so this step is skipped instead
# of failing with an IndexError.
pattern = re.compile(r'(\\begin{document})')
s = pattern.split(textContent)
if len(s) > 1:
    s[0] = _stash(s[0] + s[1])
    s[1] = ''
    textContent = ''.join(s)

# Hide the delimited expressions.
for w in expressDict:
    pattern = re.compile(w)
    # Every pattern has exactly one capture group, so split() returns
    # [text, delimiter, text, delimiter, ..., text]: delimiters sit at the
    # odd indices and the list length is odd.
    s = pattern.split(textContent)
    j = 1
    # A pair needs s[j] (opening), s[j+1] (content) and s[j+2] (closing).
    # A trailing delimiter without a partner is left untouched rather than
    # indexing past the end of the list.
    while j + 2 < len(s):
        hidden = s[j] + s[j + 1] + s[j + 2]
        print(hidden)
        s[j] = _stash(hidden)
        s[j + 1] = ''
        s[j + 2] = ''
        j += 4  # skip the content, the closing delimiter and the next text
    textContent = ''.join(s)

# Hide the remaining single-token keywords, one placeholder per match.
for w in replaceDict:
    textContent = re.compile(w).sub(_stash_and_report, textContent)

# Save the hidden snippets in placeholder order, each followed by splitChar.
with open(tempFileName, 'w', encoding='utf-8') as ftempout:
    ftempout.write(''.join(entry + splitChar for entry in replacedContent))
with open(outputFileName, 'w', encoding='utf-8') as fout:
    fout.write(textContent)

(2)翻译完成后替换回来(将 out.tex 的翻译结果保存为 transed.tex,再运行下面的脚本)

# Put the saved TeX keywords and formulas back into the translated file,
# replacing every numbered placeholder with the text it stands for.
import os
import re
replaceChar = 'の'  # marks a hidden keyword; should not occur in the source text and should be left alone by the translator
splitChar = 'ん'  # separates the saved keywords inside the temporary file
inputFileName = 'main.tex'
outputFileName = 'out.tex'
translatedFileName = 'transed.tex'
tempFileName = 'temp.txt'
finalOutputFileName = 'out2.tex'

# replacedContent[i] is the original text behind placeholder number i.
# The separator is split on literally, which is how it was written.
with open(tempFileName, 'r', encoding='utf-8') as ftemp:
    replacedContent = ftemp.read().split(splitChar)
with open(translatedFileName, 'r', encoding='utf-8') as fin:
    textContent = fin.read()

contentPattern = '(' + replaceChar + r'[0-9]+' + replaceChar + ')'
pattern = re.compile(contentPattern)


def _restore(match):
    """Callback for re.sub: return the saved text for one placeholder."""
    token = match.group(0)
    number = int(token[len(replaceChar):-len(replaceChar)])
    if number >= len(replacedContent):
        raise IndexError(
            'placeholder ' + token + ' has no saved content in ' + tempFileName
        )
    # Surround the restored text with spaces; without them a keyword and the
    # neighbouring text may stick together and cause a compile error.
    return ' ' + replacedContent[number] + ' '


# A saved snippet may itself contain placeholders, so keep substituting until
# no well-formed placeholder is left. The loop condition looks for a complete
# placeholder rather than a bare marker character: a stray marker that is not
# part of a placeholder can never be substituted and would otherwise keep the
# loop running forever.
while pattern.search(textContent) is not None:
    textContent = pattern.sub(_restore, textContent)

if replaceChar in textContent:
    print('warning: marker character ' + replaceChar
          + ' is still present in the output; a placeholder may have been'
          + ' altered by the translation')

with open(finalOutputFileName, 'w', encoding='utf-8') as fout:
    fout.write(textContent)