
Python: writing a file from multiple processes hits an encoding problem, but multithreading doesn't

I crawl data with multiple processes and write it to a file. The script runs without any errors, but when I open the file the text is garbled (mojibake).

When I rewrite it with multiple threads the problem goes away and everything works fine.
Here is the code that writes the data to the file:


<code># assumes: import os, codecs, requests; from bs4 import BeautifulSoup
def Get_urls(start_page, end_page):
    print('run task {} ({})'.format(start_page, os.getpid()))

    # every worker process opens the same file in append mode
    url_text = codecs.open('url.txt', 'a', 'utf-8')
    for i in range(start_page, end_page + 1):
        pageurl = baseurl1 + str(i) + baseurl2 + searchword
        response = requests.get(pageurl, headers=header)
        soup = BeautifulSoup(response.content, 'html.parser')
        a_list = soup.find_all('a')
        for a in a_list:
            if a.text != '' and 'wssd_content.jsp?bookid' in a['href']:
                text = a.text.strip()
                url = baseurl + str(a['href'])
                url_text.write(text + '\t' + url + '\n')
    url_text.close()</code>

The multiprocessing version uses a process pool:


<code>def Multiple_processes_test():
    t1 = time.time()
    print('parent process {}'.format(os.getpid()))
    page_ranges_list = [(1, 3), (4, 6), (7, 9)]
    pool = multiprocessing.Pool(processes=3)
    for page_range in page_ranges_list:
        pool.apply_async(func=Get_urls, args=(page_range[0], page_range[1]))
    pool.close()
    pool.join()
    t2 = time.time()
    print('time:', t2 - t1)</code>

我想大声告诉你 · 2770 days ago · 1164

All replies (3)

  • 巴扎黑 · 2017-06-15 09:23:36

    As your screenshot already says, the file was loaded with the wrong encoding. That means that when your multiple processes wrote it, what ended up in the file was not valid utf-8.
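
    The corruption is easy to reproduce without any processes at all. Here is a minimal sketch (demo.txt and the 4-byte split are illustrative assumptions, not from the question) of how two unsynchronized appends can interleave in the middle of a multi-byte UTF-8 character:

    <code># -*- coding: utf-8 -*-
    # Sketch: interleaving writes mid-character produces invalid UTF-8.
    s = u'資料'.encode('utf-8')  # 6 bytes: two 3-byte characters

    with open('demo.txt', 'wb') as f:      # demo.txt is a made-up name
        f.write(s[:4])                     # writer A flushes part of a character
        f.write(u'foo\n'.encode('utf-8'))  # writer B's bytes land in between
        f.write(s[4:])                     # the rest of A's character arrives late

    # Reading it back fails, which an editor displays as mojibake:
    open('demo.txt', 'rb').read().decode('utf-8')  # raises UnicodeDecodeError</code>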

  • 世界只因有你 · 2017-06-15 09:23:36

    Add this as the first line of the source file:

    <code>#coding: utf-8</code>

  • 我想大声告诉你 · 2017-06-15 09:23:36

    Opening the same file from several processes at once is quite dangerous; the chance of an error is high.
    Multithreading doesn't hit the problem, most likely because of the GIL;
    with multiple processes there is no such lock, so the file is easily corrupted.

    <code>url_text = codecs.open('url.txt', 'a', 'utf-8')</code>
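
    If you do want several processes appending to one shared file, you would at least need an explicit lock so that whole lines are written atomically. A minimal sketch (the write_line helper and the demo loop are assumptions of mine, not code from this thread):

    <code>import codecs
    import multiprocessing

    def write_line(lock, line):
        # Open, write one complete line, and close while holding the lock,
        # so no two processes can interleave bytes mid-character.
        with lock:
            f = codecs.open('url.txt', 'a', 'utf-8')
            f.write(line)
            f.close()

    if __name__ == '__main__':
        manager = multiprocessing.Manager()
        lock = manager.Lock()  # a Manager lock can be passed to Pool workers
        pool = multiprocessing.Pool(processes=3)
        for i in range(9):
            pool.apply_async(write_line, args=(lock, u'line {}\n'.format(i)))
        pool.close()
        pool.join()</code>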

    I'd suggest switching to a producer-consumer pattern instead!

    For example:


    <code># -*- coding: utf-8 -*-
    import time
    import os
    import codecs
    import multiprocessing
    import requests
    from bs4 import BeautifulSoup

    baseurl = ''
    baseurl1 = ''
    baseurl2 = ''
    pageurl = ''
    searchword = ''
    header = {}

    # Stub out requests.get so the example runs without the real site.
    def fake(url, **kwargs):
        class Response(object):
            pass
        o = Response()
        o.content = '<a href="/{}/wssd_content.jsp?bookid">foo</a>'.format(url)
        return o

    requests.get = fake


    def Get_urls(start_page, end_page, queue):
        # Producer: parse the pages and push each finished line onto the
        # queue instead of writing to the file directly.
        print('run task {} ({})'.format(start_page, os.getpid()))
        try:
            for i in range(start_page, end_page + 1):
                pageurl = baseurl1 + str(i) + baseurl2 + searchword
                response = requests.get(pageurl, headers=header)
                soup = BeautifulSoup(response.content, 'html.parser')
                a_list = soup.find_all('a')
                for a in a_list:
                    if a.text != '' and 'wssd_content.jsp?bookid' in a['href']:
                        text = a.text.strip()
                        url = baseurl + str(a['href'])
                        queue.put(text + '\t' + url + '\n')
        except Exception:
            import traceback
            traceback.print_exc()


    def write_file(queue):
        # Consumer: the only process that touches url.txt,
        # so writes can never interleave.
        print("start write file")
        url_text = codecs.open('url.txt', 'a', 'utf-8')
        while True:
            line = queue.get()
            if line is None:  # sentinel: all producers are done
                break
            print("write {}".format(line))
            url_text.write(line)
        url_text.close()


    def Multiple_processes_test():
        t1 = time.time()
        manager = multiprocessing.Manager()
        queue = manager.Queue()
        print('parent process {}'.format(os.getpid()))
        page_ranges_list = [(1, 3), (4, 6), (7, 9)]
        consumer = multiprocessing.Process(target=write_file, args=(queue,))
        consumer.start()
        pool = multiprocessing.Pool(processes=3)
        results = []
        for page_range in page_ranges_list:
            result = pool.apply_async(func=Get_urls,
                                      args=(page_range[0],
                                            page_range[1],
                                            queue))
            results.append(result)
        pool.close()
        pool.join()
        queue.put(None)  # tell the consumer to stop
        consumer.join()
        t2 = time.time()
        print('time:', t2 - t1)


    if __name__ == '__main__':
        Multiple_processes_test()</code>

    Result:

    foo /4/wssd_content.jsp?bookid
    foo /5/wssd_content.jsp?bookid
    foo /6/wssd_content.jsp?bookid
    foo /1/wssd_content.jsp?bookid
    foo /2/wssd_content.jsp?bookid
    foo /3/wssd_content.jsp?bookid
    foo /7/wssd_content.jsp?bookid
    foo /8/wssd_content.jsp?bookid
    foo /9/wssd_content.jsp?bookid
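
    To confirm the fix, it's worth reading the file back and checking that it decodes cleanly; a quick check, assuming the same url.txt path:

    <code>with open('url.txt', 'rb') as f:
        data = f.read()
    data.decode('utf-8')  # raises UnicodeDecodeError if any write interleaved
    print('url.txt decodes cleanly: {} bytes'.format(len(data)))</code>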
