from html.parser import HTMLParser import urllib.request with urllib.request.urlopen('https://www.python.org/events/python-events/') as url: content = url.read() html = content.decode('utf-8') class MyHtmlParser(HTMLParser): def __init__(self): HTMLParser.__init__(self) self.location = [] self.flag = False def handle_starttag(self, tag, attrs): if tag == 'span' and 'event-location' in dict(attrs)['class']: self.flag = True def handle_data(self, data): if self.flag: self.location.append(data) self.flag = False parser = MyHtmlParser() parser.feed(html) for i in parser.location: print(i)
运行结果:
Traceback (most recent call last): File "C:/Users/user/PycharmProjects/untitled1/python6.py", line 22, in <module> parser.feed(html) File "C:\Users\user\AppData\Local\Programs\Python\Python35-32\lib\html\parser.py", line 111, in feed self.goahead(0) File "C:\Users\user\AppData\Local\Programs\Python\Python35-32\lib\html\parser.py", line 171, in goahead k = self.parse_starttag(i) File "C:\Users\user\AppData\Local\Programs\Python\Python35-32\lib\html\parser.py", line 345, in parse_starttag self.handle_starttag(tag, attrs) File "C:/Users/user/PycharmProjects/untitled1/python6.py", line 14, in handle_starttag if tag == 'span' and 'event-location' in dict(attrs)['class']: KeyError: 'class'
为什么会有KeyError呢?这是爬取的目标网站的一段代码:
<span class="event-location">Capital One McLean Conference Center in McLean, VA, USA</span>
没有大小写错误,怎么会有KeyError呢?
另外我爬取这一段代码:
<a href="/events/python-events/461/">PyConES 2016 - Almeria</a>
把代码改成if tag == 'a' and 'python-event' in dict(attrs)['href']:
就可以成功爬取。。。
所以是什么问题呢?
是class有禁忌????
三叔2016-10-25 17:10:37
如果某个span没有 class属性,dict就报错~
if tag == 'span' and 'event-location' in dict(attrs)['class']: self.flag = True
改成这样:
if tag == 'span' and 'class' in dict(attrs) and 'event-location' in dict(attrs)['class']: self.flag = True