main
HelenMay 3 days ago
parent 5f4775ef2d
commit faec49b807

Binary file not shown.

11
src/.gitignore vendored

@ -0,0 +1,11 @@
images_temp/
output.xlsx
result.xls
build
test_files/
config.yml
*.xlsx
*.xls
*.pdf
.vscode/
__pycache__

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2019 绯末
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

@ -0,0 +1,115 @@
<div align=center><img style="width:60%;" src="https://i.loli.net/2020/08/19/21HPdw8r94XFlgz.png" ></div>
# Any2Excel
一款使用 Python 编写的图像内表格数据提取工具,可以高效识别 PDF 原件、扫描件、复印件、彩色(黑白)照片、截图内的数据表格,提取后转为 Excel 文件输出。
这是一款开源工具,我给它取名叫`Any2Excel`。顾名思义,往后的目标就是提取任意格式文件中的数据图表到可被结构化处理的 Excel 文件。
识别度高,操作简单,使用场景广泛。
_支持手机拍照、扫描件、原件、复印件等等_
# 快速开始
## Python PIP 依赖
```
pip install -r .\requirements.txt
```
## 外部依赖
```cmd
安装 poppler 后,将其 bin 目录加入系统环境变量 PATH 中
```
## 配置腾讯云
`cp config+sample.yml config.yml`后补全`config.yml`中的配置信息。
## 工作原理
- 将 PDF 按每页转为 JPG 图像文件
- 暂时只取 PDF 第一页内容
- 提交 OCR 识别这个图像文件
- 将识别结果转为 Excel 导出
- 清除 Excel 文件的全部样式
## 命令行CLI
### PDF 转 Excel
```shell
cd PDF2Excel
python3 pdf2excel.py test.pdf
```
### 图片 转 Excel
```shell
cd PDF2Excel
python3 image2excel.py capture.jpg
```
## 可视化拖拽
将需要转换的 PDF 文件/图片文件,拖拽到程序上就会自动执行
## 输出文件
`*.xlsx` 包含了样式的 Excel 文件,可能会因为样式过多而文件过大。
`*.xls` 移除了样式的 Excel 文件,推荐。
# 配置
`config.yml` 内包含了腾讯云的相关鉴权信息
# 演示
## 动画
### Image To Excel
![](https://oss.liujunyang.com/blog/images/1.gif)
### PDF To Excel
![](https://oss.liujunyang.com/blog/images/2.gif)
## 截图对比
### 原始文件
![](https://www.famio.cn/img/posts/14/1.jpg)
### 提取后
![](https://www.famio.cn/img/posts/14/2.jpg)
# 贡献
感谢以下开源项目:
pdf2image
PyMuPDF
PyYAML
Laravel-Admin
所有的贡献者都在本项目的贡献清单中。
# 安全漏洞
如果您在 Any2Excel 中发现安全漏洞,请通过 famio@qq.com 发送电子邮件告知我。
# 开源协议
遵循 MIT 开源协议。

@ -0,0 +1,2 @@
secret_id: AKIDd9U12121212121B2hS6YHvtg
secret_key: jGkBUgh1212122112mgR2KahBZllZA

@ -0,0 +1,22 @@
import fitz
import os
from pdf2image import convert_from_path
# Wrapper that renders PDF pages to image files
def pdf2jpg(pdfPath, imagePath):
    """Render every page of a PDF into an image file inside ``imagePath``.

    Each page is written as ``<page index>.jpg`` (0-based index). The data is
    emitted by ``writePNG``, so the files hold PNG-encoded pixels even though
    they carry a ``.jpg`` extension; the file name is kept unchanged so that
    existing consumers looking for ``0.jpg`` keep working.

    Args:
        pdfPath: Path of the PDF document to open.
        imagePath: Directory that receives the images; created if missing.
    """
    # The same transform applies to every page, so build it once.
    # A zoom of 4 on both axes scales the rendered width and height fourfold.
    # Reference points noted by the original author for a page whose default
    # render is 792x612: 1.33333333 -> 1056x816, 2 -> 1584x1224.
    zoom_x = 4
    zoom_y = 4
    rotate = 0
    mat = fitz.Matrix(zoom_x, zoom_y).preRotate(rotate)

    pdfDoc = fitz.open(pdfPath)
    try:
        # Create the output folder once instead of re-checking for each page.
        os.makedirs(imagePath, exist_ok=True)
        for pg in range(pdfDoc.pageCount):
            page = pdfDoc[pg]
            # alpha=False: render without a transparency channel.
            pix = page.getPixmap(matrix=mat, alpha=False)
            pix.writePNG(os.path.join(imagePath, '%s.jpg' % pg))
    finally:
        # Release the document handle even when rendering fails midway.
        pdfDoc.close()

@ -0,0 +1,30 @@
import sys
import os
import ocr
import draw
import xlrd
import xlwt
import yaml_class
from xlutils.copy import copy
# Command-line entry: the first argument is the image file to convert.
pic_path = sys.argv[1]
# Base name up to the first dot; used to name the output files.
pic_name = os.path.split(pic_path)[-1].split(".")[0]
print('doing')

# Run table OCR on the image using the credentials stored in config.yml.
config = yaml_class.get_yaml_data("config.yml")
trans = ocr.OCR()
path_excel = trans.img_to_excel(
    pic_name,
    # Pass the real file path: the OCR wrapper opens this path for reading,
    # so the extension-less base name would not point at the image.
    image_path=pic_path,
    secret_id=config['secret_id'],
    secret_key=config['secret_key'],
)

# Re-open the workbook the OCR step just wrote (its path is returned above)
# and save a copy as <pic_name>.xls.
old_excel = xlrd.open_workbook(path_excel)
new_excel = copy(old_excel)
new_excel.save(pic_name+'.xls')

10
src/node_modules/.yarn-integrity generated vendored

@ -0,0 +1,10 @@
{
"systemParams": "win32-x64-83",
"modulesFolders": [],
"flags": [],
"linkedModules": [],
"topLevelPatterns": [],
"lockfileEntries": {},
"files": [],
"artifacts": {}
}

@ -0,0 +1,54 @@
from tencentcloud.common import credential
from tencentcloud.common.profile.client_profile import ClientProfile
from tencentcloud.common.profile.http_profile import HttpProfile
from tencentcloud.common.exception.tencent_cloud_sdk_exception import TencentCloudSDKException
from tencentcloud.ocr.v20181119 import ocr_client, models
import base64
# Wrapper around the Tencent Cloud table OCR call
class OCR(object):
    """Thin wrapper around the Tencent Cloud table OCR endpoint."""

    def img_to_excel(self,
                     output_file_name,
                     image_path,
                     secret_id,
                     secret_key):
        """Send an image to the TableOCR API and save the returned workbook.

        Args:
            output_file_name: Name of the output file without extension;
                ``.xlsx`` is appended.
            image_path: Path of the image file to upload.
            secret_id: Tencent Cloud account secret id.
            secret_key: Tencent Cloud account secret key.

        Returns:
            The path of the ``.xlsx`` file that was written.
        """
        # Credential object built from the account's secretId / secretKey.
        cred = credential.Credential(
            secret_id,
            secret_key
        )

        # Configure and create the OCR client.
        httpProfile = HttpProfile()
        httpProfile.endpoint = "ocr.tencentcloudapi.com"
        clientProfile = ClientProfile()
        clientProfile.httpProfile = httpProfile
        clientProfile.signMethod = "TC3-HMAC-SHA256"
        client = ocr_client.OcrClient(cred, "ap-shanghai", clientProfile)

        # Use the request model that belongs to the TableOCR action invoked
        # below, rather than the GeneralFastOCR request.
        req = models.TableOCRRequest()

        # Read the image and attach it to the request as Base64 text.
        with open(image_path, 'rb') as f:
            image = f.read()
        req.ImageBase64 = base64.b64encode(image).decode('utf-8')

        # Call the API through the client.
        resp = client.TableOCR(req)

        # resp.Data carries the Excel file as Base64-encoded data.
        data = resp.Data

        # Decode the payload and write it out as <output_file_name>.xlsx.
        path_excel = str(output_file_name) + ".xlsx"
        with open(path_excel, 'wb') as f:
            f.write(base64.b64decode(data))
        return path_excel

@ -0,0 +1,33 @@
import sys
import os
import ocr
import draw
import xlrd
import xlwt
import yaml_class
from xlutils.copy import copy
# Command-line entry: the first argument is the PDF file to convert.
pdf_path = sys.argv[1]
# Base name up to the first dot; used for the temp folder and output files.
pdf_name = os.path.basename(pdf_path).split(".")[0]
print('doing')

# Render the PDF pages into a per-document folder under ./images_temp/.
images_dir = './images_temp/%s/' % pdf_name
draw.pdf2jpg(pdf_path, images_dir)

# Run OCR on the first rendered page (0.jpg) with the credentials
# stored in config.yml.
config = yaml_class.get_yaml_data("config.yml")
jpg_name = images_dir + '0.jpg'
trans = ocr.OCR()
path_excel = trans.img_to_excel(
    pdf_name,
    image_path=jpg_name,
    secret_id=config['secret_id'],
    secret_key=config['secret_key'],
)

# Re-open the generated .xlsx workbook and save a copy as <pdf_name>.xls.
old_excel = xlrd.open_workbook('%s.xlsx' % pdf_name)
new_excel = copy(old_excel)
ws = new_excel.get_sheet(0)
new_excel.save('%s.xls' % pdf_name)

Binary file not shown.

@ -0,0 +1,13 @@
import yaml
import os
def get_yaml_data(yaml_file):
    """Read a UTF-8 YAML file and return its parsed content.

    Args:
        yaml_file: Path of the YAML file to read.

    Returns:
        The parsed document, e.g. a dict or a list depending on the file's
        top-level structure.
    """
    # The context manager closes the file even if reading raises.
    with open(yaml_file, 'r', encoding="utf-8") as file:
        file_data = file.read()
    # safe_load builds only plain Python objects and does not need an explicit
    # Loader argument, which newer PyYAML releases require for yaml.load().
    return yaml.safe_load(file_data)

@ -0,0 +1,4 @@
# THIS IS AN AUTOGENERATED FILE. DO NOT EDIT THIS FILE DIRECTLY.
# yarn lockfile v1
Loading…
Cancel
Save