python 解析帆软cpt文件-JZTXT

工作中遇到这样一个需求：需要从帆软 cpt / frm 文件内的查询语句中提取所用到的库表，以便只同步迁移相应的库表数据。cpt / frm 格式的文件是帆软报表中特有的数据文件格式。在进行数据报表迁移时，导出相应的目录资源即可获得一个压缩包，其中包含两个 json 文件和一个 frReport 文件夹；需要解析的库表名称就位于 frReport 文件夹各子文件夹下的 cpt / frm 文件中。

用本地文本编辑器打开一个cpt格式文件，大概内容如下：

可以看到，cpt 文件的内容是 XML 格式的文本，因此在用 Python 进行解析时，可以使用标准库中的 xml.etree.ElementTree 模块。

完整脚本如下：

import pandas as pd
# pip install pypiwin32 / pywin32
# import win32com.client as win32  仅支持 windows 系统
import xml.etree.ElementTree as ET
import os
import re

# Recursively collect the paths of all files below a directory.
def read_files_in_directory(directory):
    """Return the paths of every file found under *directory*.

    Sub-directories are explored iteratively: a plain list is used as a
    LIFO stack of directories that still have to be scanned, so the most
    recently discovered directory is visited next.

    Args:
        directory: Path of the directory to start from.

    Returns:
        A list with the path of each file encountered, in discovery order.
    """
    collected = []
    pending_dirs = [directory]  # directories waiting to be scanned
    while pending_dirs:
        folder = pending_dirs.pop()  # take the most recently added directory
        for item in os.scandir(folder):
            if item.is_file():
                collected.append(item.path)
                continue
            if item.is_dir():
                # Defer the sub-directory; it is scanned on a later pass.
                pending_dirs.append(item.path)
    return collected


# Directory that holds the exported FineReport templates (searched recursively).
cpt_path = '/Users/测评文件/帆软报表脚本/待解析脚本'

# Keep only FineReport template files (.cpt / .frm); the suffix match is
# case-sensitive.
cpt_file = [f for f in read_files_in_directory(cpt_path) if f.endswith(('.cpt','.frm'))]


def _extract_table_names(sql):
    """Return the tokens that directly follow ``from`` / ``join`` in *sql*.

    The statement is lower-cased and split on whitespace. For every ``from``
    or ``join`` keyword the next token is taken as a table name, unless that
    token opens a sub-query (``(`` or ``(select``). Anything from the first
    ``)`` onwards and trailing ``,`` / ``;`` are removed from the token.

    This is a whitespace-token heuristic, not a SQL parser: only the first
    token after the keyword is captured, so additional tables in a
    comma-separated ``from a, b`` list are not reported.

    Args:
        sql: Query text, or ``None`` / empty when the element has no text.

    Returns:
        A list of table-name strings (possibly empty, duplicates kept).
    """
    if not sql:
        return []
    words = sql.lower().split()
    tables = []
    # Pairing each word with its successor means a trailing "from"/"join"
    # simply has no partner, instead of indexing past the end of the list.
    for keyword, candidate in zip(words, words[1:]):
        if keyword not in ("from", "join"):
            continue
        if candidate in ("(", "(select"):
            continue  # sub-query, not a table
        table_name = candidate.split(")")[0].rstrip(",;")
        if table_name:
            tables.append(table_name)
    return tables


database_table_pairs = []  # (database name, table name) tuples, duplicates allowed
for file_dir in cpt_file:
    try:
        # Parsing from the path lets the XML parser honour the encoding
        # declared in the file instead of the platform's default encoding.
        root = ET.parse(file_dir).getroot()
    except ET.ParseError as err:
        # Report and move on so one malformed template does not abort the run.
        print("skipped (not well-formed XML): {} ({})".format(file_dir, err))
        continue

    # An element without text yields an empty database name rather than a crash.
    database_names = [(node.text or "").strip() for node in root.iter("DatabaseName")]

    # Tokenise every query once per file; the result does not depend on the
    # database name it is later paired with.
    table_names = [
        table
        for query in root.iter("Query")
        for table in _extract_table_names(query.text)
    ]

    # NOTE(review): every database name in the file is paired with every table
    # found in the file (cross product). If a template uses several
    # connections this over-reports pairs; narrowing it requires knowing how
    # each Query element relates to its DatabaseName element - confirm against
    # a real template before changing.
    database_table_pairs.extend(
        (database_name, table_name)
        for database_name in database_names
        for table_name in table_names
    )

# De-duplicate once after all files are processed (also defined when no file
# matched), and sort so the exported sheet is reproducible between runs.
database_table = sorted(set(database_table_pairs))


# Save the result to a local Excel workbook.
df = pd.DataFrame(database_table, columns=['db_name', 'table_name'])
df.to_excel('/Users/测评文件/帆软报表脚本/0619-解析库表.xlsx', index=False)