1
happykjoy OP 各位帮帮忙,我是小白
|
2
sunnyadamm 2018-09-10 10:35:22 +08:00
。。。你这代码贴的,估计大家连读都不想读。。。
|
3
happykjoy OP #!/usr/bin/python
# -*- coding: utf-8 -*-
"""Fetch the Douban Books homepage and print the book entries found on it."""
import re
from multiprocessing import Pool

# Seconds to wait for the server before giving up, so that a stalled
# connection cannot hang a worker process forever.
REQUEST_TIMEOUT = 10

# One match per <li> book entry. Capture groups, in order:
# link, title, cover image, author, year, publisher, abstract.
# Compiled once at import time instead of on every parse_one_page() call.
_BOOK_PATTERN = re.compile(
    r'<li.*?cover.*?href="(.*?)" title="(.*?)">.*?img src="(.*?)"'
    r'.*?author">(.*?)</div>.*?year">(.*?)</span>.*?publisher">(.*?)'
    r'</span>.*?abstract">(.*?)</p>.*?</li>',
    re.S,
)


def get_one_page(url):
    """Return the decoded body of *url*, or None when it cannot be fetched.

    None is returned both when ``requests`` raises a RequestException
    (including a timeout) and when the response status is not 200.
    """
    # Imported here rather than at module level so that parse_one_page()
    # stays importable on machines where ``requests`` is not installed.
    import requests
    from requests.exceptions import RequestException

    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except RequestException:
        return None
    if response.status_code != 200:
        return None
    return response.text


def parse_one_page(html):
    """Yield one dict per book entry matched in *html*.

    *html* may be None or empty (get_one_page() returns None on failure);
    in that case nothing is yielded instead of raising TypeError.
    """
    if not html:
        return
    for match in _BOOK_PATTERN.finditer(html):
        link, title, cover, author, year, publisher, abstract = match.groups()
        yield {
            '地址': link,
            '书名': title,
            '封面': cover,
            '作者': author.strip(),
            '出版时间': year.strip(),
            '出版社': publisher.strip(),
            '详细': abstract.strip(),
        }


def main(offset=0):
    """Fetch the homepage, parse it and print every book entry.

    *offset* is accepted because ``Pool.map`` calls the worker with one
    argument per item; without this parameter every worker raised TypeError.
    NOTE(review): the URL below has no paging parameter, so *offset* is
    currently unused and every call fetches the same page. Wire it into a
    paginated URL, or map over a single item, once the target is decided.
    """
    url = 'https://book.douban.com/'
    html = get_one_page(url)
    for item in parse_one_page(html):
        print(item)


if __name__ == '__main__':
    pool = Pool()
    try:
        pool.map(main, [i * 10 for i in range(10)])
    finally:
        # Always release the worker processes, even if a worker raised.
        pool.close()
        pool.join()
4
happykjoy OP @sunnyadamm 抱歉,新手刚来,第一次贴代码,贴成纯文本格式了。
|