diff --git a/doc/Any2Excel 泛读报告.docx b/doc/Any2Excel 泛读报告.docx new file mode 100644 index 0000000..32651b1 Binary files /dev/null and b/doc/Any2Excel 泛读报告.docx differ diff --git a/src/.gitignore b/src/.gitignore new file mode 100644 index 0000000..bcdb1b2 --- /dev/null +++ b/src/.gitignore @@ -0,0 +1,11 @@ +images_temp/ +output.xlsx +result.xls +build +test_files/ +config.yml +*.xlsx +*.xls +*.pdf +.vscode/ +__pycache__ \ No newline at end of file diff --git a/src/LICENSE b/src/LICENSE new file mode 100644 index 0000000..33761e9 --- /dev/null +++ b/src/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2019 绯末 + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/src/README.md b/src/README.md new file mode 100644 index 0000000..098ac65 --- /dev/null +++ b/src/README.md @@ -0,0 +1,115 @@ +
def pdf2jpg(pdfPath, imagePath, zoom=4):
    """Render every page of a PDF into ``imagePath`` as ``<page index>.jpg``.

    Args:
        pdfPath: Path of the PDF file, opened with PyMuPDF (``fitz``).
        imagePath: Output directory; it is created when it does not exist.
        zoom: Scale factor applied to both axes when rasterising a page.
            Defaults to 4, the value that used to be hard-coded.

    Note:
        Pages are saved with ``writePNG``, so the files contain PNG data even
        though they carry a ``.jpg`` extension. The file name is kept as-is
        because pdf2excel.py opens ``0.jpg`` from this directory.
    """
    # Create the output directory once, up front, instead of re-checking it
    # for every page (and never creating it for a zero-page document).
    os.makedirs(imagePath, exist_ok=True)

    pdfDoc = fitz.open(pdfPath)
    try:
        # One matrix serves all pages; the former preRotate(0) was a no-op.
        mat = fitz.Matrix(zoom, zoom)
        # NOTE(review): pageCount / getPixmap / writePNG are the camelCase
        # PyMuPDF names the original code used; confirm they exist in the
        # PyMuPDF version this project pins.
        for pg in range(pdfDoc.pageCount):
            pix = pdfDoc[pg].getPixmap(matrix=mat, alpha=False)
            pix.writePNG(os.path.join(imagePath, '%s.jpg' % pg))
    finally:
        # Release the document handle even if rendering a page fails.
        pdfDoc.close()
class OCR(object):
    """Wrapper around the Tencent Cloud ``TableOCR`` API call."""

    def img_to_excel(self,
                     output_file_name,
                     image_path,
                     secret_id,
                     secret_key):
        """Send an image to Tencent Cloud TableOCR and save the Excel result.

        Args:
            output_file_name: Base name of the output file; ``.xlsx`` is
                appended to ``str(output_file_name)``.
            image_path: Path of the image file to recognise.
            secret_id: Tencent Cloud account SecretId.
            secret_key: Tencent Cloud account SecretKey.

        Returns:
            The path of the written ``.xlsx`` file.
        """
        # Credential object built from the account's SecretId / SecretKey.
        cred = credential.Credential(secret_id, secret_key)

        # Configure the HTTP endpoint and signing method, then build the client.
        httpProfile = HttpProfile()
        httpProfile.endpoint = "ocr.tencentcloudapi.com"
        clientProfile = ClientProfile()
        clientProfile.httpProfile = httpProfile
        clientProfile.signMethod = "TC3-HMAC-SHA256"
        client = ocr_client.OcrClient(cred, "ap-shanghai", clientProfile)

        # Use the request type that matches the TableOCR action invoked below
        # (the original built a GeneralFastOCRRequest and passed it to TableOCR).
        req = models.TableOCRRequest()

        # The API takes the image as a Base64-encoded string.
        with open(image_path, 'rb') as image_file:
            req.ImageBase64 = base64.b64encode(image_file.read()).decode('utf-8')

        # Call the service; resp.Data is the Base64-encoded Excel document.
        resp = client.TableOCR(req)

        # Decode the payload and write it next to the caller as <name>.xlsx.
        path_excel = str(output_file_name) + ".xlsx"
        with open(path_excel, 'wb') as excel_file:
            excel_file.write(base64.b64decode(resp.Data))
        return path_excel
def get_yaml_data(yaml_file):
    """Parse a UTF-8 encoded YAML file and return its content.

    Args:
        yaml_file: Path of the YAML file to read.

    Returns:
        The parsed document as returned by ``yaml.safe_load`` (for example a
        dict or a list); ``None`` for an empty file.
    """
    # The context manager closes the file even if reading or parsing fails.
    with open(yaml_file, 'r', encoding="utf-8") as stream:
        # safe_load replaces yaml.load(data) without a Loader argument, which
        # is deprecated in PyYAML 5.1+ and a TypeError in PyYAML 6. It only
        # builds standard YAML types, so a config file cannot instantiate
        # arbitrary Python objects.
        return yaml.safe_load(stream)