import requests
from pyquery import PyQuery as pq
def getHtml(url, timeout=10):
    """Download *url* and return the decoded response body.

    The request is sent with a desktop-browser ``User-Agent`` header, and the
    response encoding is set from ``apparent_encoding`` before decoding.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument. Without it a stalled server would block forever.

    Returns:
        The response text on success. If the request fails with a
        ``requests.RequestException`` (including the HTTP error raised by
        ``raise_for_status`` and timeouts), the exception instance is
        *returned* rather than raised, so callers must check the result type.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }
    try:
        r = requests.get(url, headers=headers, timeout=timeout)
        r.raise_for_status()
        # Use the encoding detected from the body instead of the one
        # declared in the response headers before decoding the text.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException as e:
        # Existing contract: hand the error back to the caller as a value.
        return e
def parseHtml(html, output_path='explore2.txt'):
    """Extract question/author/answer records from *html* and append them to a file.

    One record is produced for every element matched by the
    ``.explore-tab.feed-item`` selector. A record consists of the question
    text, the author text and the answer text on separate lines, followed by
    a line of 50 ``=`` characters.

    Args:
        html: Markup to parse with PyQuery.
        output_path: File the records are appended to (UTF-8). Defaults to
            ``explore2.txt``.

    Returns:
        None. When no element matches, the output file is not opened at all.
    """
    doc = pq(html)
    # NOTE(review): written without a space, this selector matches elements
    # carrying both classes. If feed items are descendants of .explore-tab,
    # it should be '.explore-tab .feed-item' -- confirm against the page markup.
    records = []
    for item in doc('.explore-tab.feed-item').items():
        # Fall back to '' so that join() below never receives a non-string.
        question = item.find('h2').text() or ''
        author = item.find('.author-link-line').text() or ''
        content_html = item.find('.content').html()
        # Only re-parse when there is markup to parse: a missing or blank
        # .content would otherwise be handed to pq() as None / empty input.
        if content_html and content_html.strip():
            answer = pq(content_html).text() or ''
        else:
            answer = ''
        records.append(
            '\n'.join([question, author, answer]) + '\n' + '=' * 50 + '\n'
        )
    if not records:
        return
    # Open the file once for the whole batch instead of once per record.
    with open(output_path, 'a', encoding='utf-8') as f:
        f.writelines(records)
def main():
    """Fetch the Zhihu explore page and pass its markup to ``parseHtml``.

    Raises:
        Exception: The exception instance returned by ``getHtml`` when the
            download failed; it is re-raised here so the real cause is
            reported.
    """
    url = "https://www.zhihu.com/explore"
    html = getHtml(url)
    # getHtml returns the exception object instead of raising it. Surface it
    # here rather than handing a non-markup value to the parser.
    if isinstance(html, Exception):
        raise html
    parseHtml(html)
# Run the scraper only when this file is executed as a script, so that
# importing the module does not trigger a network request and a file write.
if __name__ == "__main__":
    main()