首頁 >後端開發 >Python教學 >怎麼透過Python實現批量資料擷取

怎麼透過Python實現批量資料擷取

王林轉載: 2023-04-29 21:16:051967瀏覽

配置需求

1.ImageMagick

2.tesseract-OCR

3.Python3.7

4.from PIL import Image as PI

5.import io

6.import os

7.import pyocr.builders

8.from cnocr import CnOcr

# 9.import xlwt

怎麼透過Python實現批量資料擷取

分析上圖發現票據金額為“貳拾萬元整”，資料金額為大寫中文，因此在導入Excel之前我們需要將金額票據的資料轉換成數字的格式，基於此，我們需要先完成大寫漢字和數字的轉換。

def chineseNumber2Int(strNum: str):
    result = 0
    temp = 1  # 存放一个单位的数字如：十万
    count = 0  # 判断是否有chArr
    cnArr = [&#39;壹&#39;, &#39;贰&#39;, &#39;叁&#39;, &#39;肆&#39;, &#39;伍&#39;, &#39;陆&#39;, &#39;柒&#39;, &#39;捌&#39;, &#39;玖&#39;]
    chArr = [&#39;拾&#39;, &#39;佰&#39;, &#39;仟&#39;, &#39;万&#39;, &#39;亿&#39;]
    for i in range(len(strNum)):
        b = True
        c = strNum[i]
        for j in range(len(cnArr)):
            if c == cnArr[j]:
                if count != 0:
                    result += temp
                    count = 0
                temp = j + 1
                b = False
                break
        if b:
            for j in range(len(chArr)):
                if c == chArr[j]:
                    if j == 0:
                        temp *= 10
                    elif j == 1:
                        temp *= 100
                    elif j == 2:
                        temp *= 1000
                    elif j == 3:
                        temp *= 10000
                    elif j == 4:
                        temp *= 100000000
                count += 1
        if i == len(strNum) - 1:
            result += temp
    return result

透過上述程式碼即可實現大寫字母與數字的轉換，例如輸入“貳拾萬元整”即可導出“200000”，再將其轉換成數字後即可極大地簡化表格的操作，也可以在完成表格操作的同時有利於資料歸檔。

接下來，我們需要分析發票的內部內容，分析下圖可知，我們需要取得以下幾個資料內容：「出票日期」、「匯票到帳日期」、「票據號碼」、「收款人”、“票據金額”、“出票人”，可透過畫圖軟體取得精準定位。

怎麼透過Python實現批量資料擷取

如圖，小黑點即滑鼠所在地，畫圖軟體左下角即他的座標。

怎麼透過Python實現批量資料擷取

提取出票日期

def text1(new_img):
    #提取出票日期
    left = 80
    top = 143
    right = 162
    bottom = 162
    image_text1 = new_img.crop((left, top, right, bottom))
    #展示图片
    #image_text1.show()
    txt1 = tool.image_to_string(image_text1)
    print(txt1)
    return str(txt1)

提取金額

def text2(new_img):
    #提取金额
    left = 224
    top = 355
    right = 585
    bottom = 380
    image_text2 = new_img.crop((left, top, right, bottom))
    #展示图片
    #image_text2.show()
    image_text2.save("img/tmp.png")
    temp = ocr.ocr("img/tmp.png")
    temp="".join(temp[0])
    txt2=chineseNumber2Int(temp)
    print(txt2)
    return txt2

提取出票人

def text3(new_img):
    #提取出票人
    left = 177
    top = 207
    right = 506
    bottom = 231
    image_text3 = new_img.crop((left, top, right, bottom))
    #展示图片
    #image_text3.show()
    image_text3.save("img/tmp.png")
    temp = ocr.ocr("img/tmp.png")
    txt3="".join(temp[0])
    print(txt3)
    return txt3

提取付款行

def text4(new_img):
    #提取付款行
    left = 177
    top = 274
    right = 492
    bottom = 311
    image_text4 = new_img.crop((left, top, right, bottom))
    #展示图片
    #image_text4.show()
    image_text4.save("img/tmp.png")
    temp = ocr.ocr("img/tmp.png")
    txt4="".join(temp[0])
    print(txt4)
    return txt4

提取匯票到帳日期

def text5(new_img):
    #提取汇票到日期
    left = 92
    top = 166
    right = 176
    bottom = 184
    image_text5 = new_img.crop((left, top, right, bottom))
    #展示图片
    #image_text5.show()
    txt5 = tool.image_to_string(image_text5)
    print(txt5)
    return txt5

提取票據單據

def text6(new_img):
    #提取票据号码
    left = 598
    top = 166
    right = 870
    bottom = 182
    image_text6 = new_img.crop((left, top, right, bottom))
    #展示图片
    #image_text6.show()
    txt6 = tool.image_to_string(image_text6)
    print(txt6)
    return txt6

在將資料全部提取完成之後，即進入設定環節，我們需要先將所有帳單檔案進行提取，取得他們的檔案名稱和路徑。

ocr=CnOcr()
tool = pyocr.get_available_tools()[0]
filePath=&#39;img&#39;
img_name=[]
for i,j,name in os.walk(filePath):
    img_name=name

在取得完整後，即可進行資料導入Excel的操作。

count=1
book = xlwt.Workbook(encoding=&#39;utf-8&#39;,style_compression=0)
sheet = book.add_sheet(&#39;test&#39;,cell_overwrite_ok=True)
for i in img_name:
    img_url = filePath+"/"+i
    with open(img_url, &#39;rb&#39;) as f:
        a = f.read()
    new_img = PI.open(io.BytesIO(a))
    ## 写入csv
    col = (&#39;年份&#39;,&#39;出票日期&#39;,&#39;金额&#39;,&#39;出票人&#39;,&#39;付款行全称&#39;,&#39;汇票到日期&#39;,&#39;备注&#39;)
    for j in range(0,7):
        sheet.write(0,j,col[j])
    book.save(&#39;1.csv&#39;)
    shijian=text1(new_img)
    sheet.write(count,0,shijian[0:4])
    sheet.write(count,1,shijian[5:])
    sheet.write(count,2,text2(new_img))
    sheet.write(count,3,text3(new_img))
    sheet.write(count,4,text4(new_img))
    sheet.write(count,5,text5(new_img))
    sheet.write(count,6,text6(new_img))
    count = count + 1

至此，完整流程結束。

附上原始碼全部

from  wand.image import  Image
from PIL import Image as PI
import pyocr
import io
import re
import os
import shutil
import pyocr.builders
from cnocr import CnOcr
import requests
import xlrd
import xlwt
from openpyxl import load_workbook
 
def chineseNumber2Int(strNum: str):
    result = 0
    temp = 1  # 存放一个单位的数字如：十万
    count = 0  # 判断是否有chArr
    cnArr = [&#39;壹&#39;, &#39;贰&#39;, &#39;叁&#39;, &#39;肆&#39;, &#39;伍&#39;, &#39;陆&#39;, &#39;柒&#39;, &#39;捌&#39;, &#39;玖&#39;]
    chArr = [&#39;拾&#39;, &#39;佰&#39;, &#39;仟&#39;, &#39;万&#39;, &#39;亿&#39;]
    for i in range(len(strNum)):
        b = True
        c = strNum[i]
        for j in range(len(cnArr)):
            if c == cnArr[j]:
                if count != 0:
                    result += temp
                    count = 0
                temp = j + 1
                b = False
                break
        if b:
            for j in range(len(chArr)):
                if c == chArr[j]:
                    if j == 0:
                        temp *= 10
                    elif j == 1:
                        temp *= 100
                    elif j == 2:
                        temp *= 1000
                    elif j == 3:
                        temp *= 10000
                    elif j == 4:
                        temp *= 100000000
                count += 1
        if i == len(strNum) - 1:
            result += temp
    return result
 
 
def text1(new_img):
    #提取出票日期
 
    left = 80
    top = 143
    right = 162
    bottom = 162
    image_text1 = new_img.crop((left, top, right, bottom))
    #展示图片
    #image_text1.show()
    txt1 = tool.image_to_string(image_text1)
 
    print(txt1)
    return str(txt1)
def text2(new_img):
    #提取金额
 
    left = 224
    top = 355
    right = 585
    bottom = 380
    image_text2 = new_img.crop((left, top, right, bottom))
    #展示图片
    #image_text2.show()
    image_text2.save("img/tmp.png")
 
    temp = ocr.ocr("img/tmp.png")
 
    temp="".join(temp[0])
    txt2=chineseNumber2Int(temp)
    print(txt2)
 
    return txt2
 
def text3(new_img):
    #提取出票人
 
    left = 177
    top = 207
    right = 506
    bottom = 231
    image_text3 = new_img.crop((left, top, right, bottom))
    #展示图片
    #image_text3.show()
    image_text3.save("img/tmp.png")
 
    temp = ocr.ocr("img/tmp.png")
    txt3="".join(temp[0])
 
    print(txt3)
    return txt3
def text4(new_img):
    #提取付款行
 
    left = 177
    top = 274
    right = 492
    bottom = 311
    image_text4 = new_img.crop((left, top, right, bottom))
    #展示图片
    #image_text4.show()
    image_text4.save("img/tmp.png")
 
    temp = ocr.ocr("img/tmp.png")
    txt4="".join(temp[0])
 
    print(txt4)
    return txt4
def text5(new_img):
    #提取汇票到日期
 
    left = 92
    top = 166
    right = 176
    bottom = 184
    image_text5 = new_img.crop((left, top, right, bottom))
    #展示图片
    #image_text5.show()
    txt5 = tool.image_to_string(image_text5)
 
    print(txt5)
    return txt5
def text6(new_img):
    #提取票据号码
 
    left = 598
    top = 166
    right = 870
    bottom = 182
    image_text6 = new_img.crop((left, top, right, bottom))
    #展示图片
    #image_text6.show()
    txt6 = tool.image_to_string(image_text6)
 
    print(txt6)
    return txt6
 
 
 
ocr=CnOcr()
 
tool = pyocr.get_available_tools()[0]
 
filePath='img'
img_name=[]
for i,j,name in os.walk(filePath):
    img_name=name
count=1
 
book = xlwt.Workbook(encoding='utf-8',style_compression=0)
sheet = book.add_sheet('test',cell_overwrite_ok=True)
 
for i in img_name:
    img_url = filePath+"/"+i
    with open(img_url, 'rb') as f:
        a = f.read()
    new_img = PI.open(io.BytesIO(a))
    ## 写入csv
    col = ('年份','出票日期','金额','出票人','付款行全称','汇票到日期','备注')
    for j in range(0,7):
        sheet.write(0,j,col[j])
    book.save('1.csv')
    shijian=text1(new_img)
    sheet.write(count,0,shijian[0:4])
    sheet.write(count,1,shijian[5:])
    sheet.write(count,2,text2(new_img))
    sheet.write(count,3,text3(new_img))
    sheet.write(count,4,text4(new_img))
    sheet.write(count,5,text5(new_img))
    sheet.write(count,6,text6(new_img))
    count = count + 1

以上是怎麼透過Python實現批量資料擷取的詳細內容。更多資訊請關注PHP中文網其他相關文章！

Python ocr excel

陳述：

本文轉載於：yisu.com。如有侵權，請聯絡admin@php.cn刪除

上一篇：Python之debug調試的方法是什麼下一篇：Python之debug調試的方法是什麼

看更多