1.ImageMagick
2.tesseract-OCR
3.Python3.7
4.from PIL import Image as PI
5.import io
6.import os
7.import pyocr.builders
8.from cnocr import CnOcr
# 9.import xlwt
分析上圖發現票據金額為“貳拾萬元整”,資料金額為大寫中文,因此在導入Excel之前我們需要將金額票據的資料轉換成數字的格式,基於此,我們需要先完成大寫漢字和數字的轉換。
def chineseNumber2Int(strNum: str): result = 0 temp = 1 # 存放一个单位的数字如:十万 count = 0 # 判断是否有chArr cnArr = ['壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖'] chArr = ['拾', '佰', '仟', '万', '亿'] for i in range(len(strNum)): b = True c = strNum[i] for j in range(len(cnArr)): if c == cnArr[j]: if count != 0: result += temp count = 0 temp = j + 1 b = False break if b: for j in range(len(chArr)): if c == chArr[j]: if j == 0: temp *= 10 elif j == 1: temp *= 100 elif j == 2: temp *= 1000 elif j == 3: temp *= 10000 elif j == 4: temp *= 100000000 count += 1 if i == len(strNum) - 1: result += temp return result
透過上述程式碼即可實現大寫字母與數字的轉換,例如輸入“貳拾萬元整”即可導出“200000”,再將其轉換成數字後即可極大地簡化表格的操作,也可以在完成表格操作的同時有利於資料歸檔。
接下來,我們需要分析發票的內部內容,分析下圖可知,我們需要取得以下幾個資料內容:「出票日期」、「匯票到帳日期」、「票據號碼」、「收款人”、“票據金額”、“出票人”,可透過畫圖軟體取得精準定位。
如圖,小黑點即滑鼠所在地,畫圖軟體左下角即他的座標。
def text1(new_img): #提取出票日期 left = 80 top = 143 right = 162 bottom = 162 image_text1 = new_img.crop((left, top, right, bottom)) #展示图片 #image_text1.show() txt1 = tool.image_to_string(image_text1) print(txt1) return str(txt1)
def text2(new_img): #提取金额 left = 224 top = 355 right = 585 bottom = 380 image_text2 = new_img.crop((left, top, right, bottom)) #展示图片 #image_text2.show() image_text2.save("img/tmp.png") temp = ocr.ocr("img/tmp.png") temp="".join(temp[0]) txt2=chineseNumber2Int(temp) print(txt2) return txt2
def text3(new_img): #提取出票人 left = 177 top = 207 right = 506 bottom = 231 image_text3 = new_img.crop((left, top, right, bottom)) #展示图片 #image_text3.show() image_text3.save("img/tmp.png") temp = ocr.ocr("img/tmp.png") txt3="".join(temp[0]) print(txt3) return txt3
def text4(new_img): #提取付款行 left = 177 top = 274 right = 492 bottom = 311 image_text4 = new_img.crop((left, top, right, bottom)) #展示图片 #image_text4.show() image_text4.save("img/tmp.png") temp = ocr.ocr("img/tmp.png") txt4="".join(temp[0]) print(txt4) return txt4
def text5(new_img): #提取汇票到日期 left = 92 top = 166 right = 176 bottom = 184 image_text5 = new_img.crop((left, top, right, bottom)) #展示图片 #image_text5.show() txt5 = tool.image_to_string(image_text5) print(txt5) return txt5
def text6(new_img): #提取票据号码 left = 598 top = 166 right = 870 bottom = 182 image_text6 = new_img.crop((left, top, right, bottom)) #展示图片 #image_text6.show() txt6 = tool.image_to_string(image_text6) print(txt6) return txt6
在將資料全部提取完成之後,即進入設定環節,我們需要先將所有帳單檔案進行提取,取得他們的檔案名稱和路徑。
ocr=CnOcr() tool = pyocr.get_available_tools()[0] filePath='img' img_name=[] for i,j,name in os.walk(filePath): img_name=name
在取得完整後,即可進行資料導入Excel的操作。
count=1 book = xlwt.Workbook(encoding='utf-8',style_compression=0) sheet = book.add_sheet('test',cell_overwrite_ok=True) for i in img_name: img_url = filePath+"/"+i with open(img_url, 'rb') as f: a = f.read() new_img = PI.open(io.BytesIO(a)) ## 写入csv col = ('年份','出票日期','金额','出票人','付款行全称','汇票到日期','备注') for j in range(0,7): sheet.write(0,j,col[j]) book.save('1.csv') shijian=text1(new_img) sheet.write(count,0,shijian[0:4]) sheet.write(count,1,shijian[5:]) sheet.write(count,2,text2(new_img)) sheet.write(count,3,text3(new_img)) sheet.write(count,4,text4(new_img)) sheet.write(count,5,text5(new_img)) sheet.write(count,6,text6(new_img)) count = count + 1
至此,完整流程結束。
from wand.image import Image from PIL import Image as PI import pyocr import io import re import os import shutil import pyocr.builders from cnocr import CnOcr import requests import xlrd import xlwt from openpyxl import load_workbook def chineseNumber2Int(strNum: str): result = 0 temp = 1 # 存放一个单位的数字如:十万 count = 0 # 判断是否有chArr cnArr = ['壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖'] chArr = ['拾', '佰', '仟', '万', '亿'] for i in range(len(strNum)): b = True c = strNum[i] for j in range(len(cnArr)): if c == cnArr[j]: if count != 0: result += temp count = 0 temp = j + 1 b = False break if b: for j in range(len(chArr)): if c == chArr[j]: if j == 0: temp *= 10 elif j == 1: temp *= 100 elif j == 2: temp *= 1000 elif j == 3: temp *= 10000 elif j == 4: temp *= 100000000 count += 1 if i == len(strNum) - 1: result += temp return result def text1(new_img): #提取出票日期 left = 80 top = 143 right = 162 bottom = 162 image_text1 = new_img.crop((left, top, right, bottom)) #展示图片 #image_text1.show() txt1 = tool.image_to_string(image_text1) print(txt1) return str(txt1) def text2(new_img): #提取金额 left = 224 top = 355 right = 585 bottom = 380 image_text2 = new_img.crop((left, top, right, bottom)) #展示图片 #image_text2.show() image_text2.save("img/tmp.png") temp = ocr.ocr("img/tmp.png") temp="".join(temp[0]) txt2=chineseNumber2Int(temp) print(txt2) return txt2 def text3(new_img): #提取出票人 left = 177 top = 207 right = 506 bottom = 231 image_text3 = new_img.crop((left, top, right, bottom)) #展示图片 #image_text3.show() image_text3.save("img/tmp.png") temp = ocr.ocr("img/tmp.png") txt3="".join(temp[0]) print(txt3) return txt3 def text4(new_img): #提取付款行 left = 177 top = 274 right = 492 bottom = 311 image_text4 = new_img.crop((left, top, right, bottom)) #展示图片 #image_text4.show() image_text4.save("img/tmp.png") temp = ocr.ocr("img/tmp.png") txt4="".join(temp[0]) print(txt4) return txt4 def text5(new_img): #提取汇票到日期 left = 92 top = 166 right = 176 bottom = 184 image_text5 = new_img.crop((left, top, right, bottom)) #展示图片 #image_text5.show() txt5 = tool.image_to_string(image_text5) print(txt5) return txt5 def text6(new_img): #提取票据号码 left = 598 top = 166 right = 870 bottom = 182 image_text6 = new_img.crop((left, top, right, bottom)) #展示图片 #image_text6.show() txt6 = tool.image_to_string(image_text6) print(txt6) return txt6 ocr=CnOcr() tool = pyocr.get_available_tools()[0] filePath='img' img_name=[] for i,j,name in os.walk(filePath): img_name=name count=1 book = xlwt.Workbook(encoding='utf-8',style_compression=0) sheet = book.add_sheet('test',cell_overwrite_ok=True) for i in img_name: img_url = filePath+"/"+i with open(img_url, 'rb') as f: a = f.read() new_img = PI.open(io.BytesIO(a)) ## 写入csv col = ('年份','出票日期','金额','出票人','付款行全称','汇票到日期','备注') for j in range(0,7): sheet.write(0,j,col[j]) book.save('1.csv') shijian=text1(new_img) sheet.write(count,0,shijian[0:4]) sheet.write(count,1,shijian[5:]) sheet.write(count,2,text2(new_img)) sheet.write(count,3,text3(new_img)) sheet.write(count,4,text4(new_img)) sheet.write(count,5,text5(new_img)) sheet.write(count,6,text6(new_img)) count = count + 1
以上是怎麼透過Python實現批量資料擷取的詳細內容。更多資訊請關注PHP中文網其他相關文章!