##-coding:utf-8-##
import time
from bs4 import BeautifulSoup
import urllib2
import pymongo
import re
import datetime
def update():
    """Scrape every article on the listing page and save it to MongoDB.

    Reads the module-level ``soup`` (the parsed listing page). For each
    ``div.item`` element it fetches the linked article page and stores one
    document in the ``test`` collection of the ``test_hq`` database with
    these keys:

    * ``_id``         -- file name of the article URL without its extension
    * ``title``       -- text of the item's ``<h2>``
    * ``content``     -- ``content`` attribute of the article page's
                         ``<meta name="description">`` tag
    * ``stock_name``  -- list of CJK runs found in the item's "stocks" text
    * ``stock_id``    -- list of digit runs found in the same text
    * ``update_time`` -- parsed publish time minus 8 hours
    * ``onlooker``    -- first integer in the "icons ic-wg" element

    Raises whatever ``urllib2``, ``BeautifulSoup`` or ``pymongo`` raise, and
    ``AttributeError``/``TypeError`` if an expected element is missing from
    the page.
    """
    # NOTE(review): pymongo.Connection only exists in old pymongo releases;
    # newer ones expose MongoClient instead -- confirm the installed version.
    connection = pymongo.Connection('192.168.1.2', 27017)
    db = connection.test_hq

    for item in soup.find_all("div", class_="item"):
        link = item.h2.a['href']

        # Fetch the article page; always release the HTTP response.
        response = urllib2.urlopen(link)
        try:
            article_html = response.read()
        finally:
            response.close()
        article = BeautifulSoup(article_html)

        # The same text carries both the stock names and their numeric ids.
        stocks_text = item.find(class_="stocks").get_text()
        published_text = re.search(
            r"\w+.*\w+", item.find(class_="fl date").span.get_text()
        ).group()

        # A fresh dict per item so no value can leak between documents.
        datas = {
            '_id': str(link).split('/')[-1].split('.')[0],
            'title': item.h2.get_text(),
            'content': article.find(attrs={"name": "description"})['content'],
            # findall already returns lists, which MongoDB stores as arrays.
            'stock_name': re.findall(u"[\u4e00-\u9fa5]+", stocks_text),
            'stock_id': re.findall(r"\d+", stocks_text),
            # Presumably the site shows UTC+8 local time and the 8-hour
            # shift converts it to UTC for storage -- TODO confirm.
            'update_time': datetime.datetime.strptime(
                published_text, '%Y-%m-%d %H:%M'
            ) - datetime.timedelta(hours=8),
            'onlooker': int(
                re.search(
                    r"\d+", item.find(class_="icons ic-wg").get_text()
                ).group()
            ),
        }
        db.test.save(datas)
def get_data():
    """Run ``update()`` when the newest listed article has changed.

    Derives an id from the first ``<h2><a href=...>`` of the module-level
    ``soup`` (the URL's file name without extension) and compares it with
    the first line of ``update.txt``. If they match, a "no update" message
    is printed. Otherwise ``update()`` is called and, once it returns, the
    new id is written to ``update.txt``.

    A missing ``update.txt`` is treated as "nothing seen yet"; any other
    I/O error while reading it is re-raised.
    """
    import errno

    title = str(soup.h2.a['href']).split('/')[-1].split('.')[0]

    try:
        with open('update.txt', 'r') as f:
            # strip() tolerates a trailing newline in a hand-edited file.
            last_seen = f.readline().strip()
    except IOError as exc:
        if exc.errno != errno.ENOENT:
            raise
        last_seen = ''

    if title == last_seen:
        print('currently no update ' + title)
    else:
        # Scrape first and record the marker afterwards, so a failed
        # update() is retried on the next poll instead of being skipped.
        update()
        with open('update.txt', 'w') as f:
            f.write(title)
# Script entry point: poll the listing page and let get_data() decide
# whether anything new needs to be scraped. The guard wraps the loop so
# that importing this module does not start (or spin in) the polling loop.
if __name__ == '__main__':
    while True:
        url = 'http://www.ipython.me/qingbao/'
        html = urllib2.urlopen(url)
        try:
            html_doc = html.read()
        finally:
            html.close()
        # update() and get_data() read this module-level name.
        soup = BeautifulSoup(html_doc)
        get_data()
        # Refresh every 30 seconds.
        time.sleep(30)