import json
import re
from multiprocessing import Pool

import requests
def get_one_page(url, timeout=10):
    """Download *url* and return its body as text.

    The request is sent with a fixed Chrome-on-Windows ``User-Agent`` header.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response text when the server answers with HTTP 200; ``None``
        for any other status code or when the request itself fails.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    }
    try:
        # requests waits indefinitely by default; a timeout keeps a stalled
        # server from hanging the calling worker forever.
        res = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # A failed request follows the same "no page" contract as a bad status.
        return None
    if res.status_code == 200:
        return res.text
    return None
# Seven capture groups, in order: board index, image data-src, link text,
# contents of the "star" paragraph, contents of the "releasetime" paragraph,
# and the "integer" and "fraction" elements.
_MOVIE_PATTERN = re.compile(
    r'<i class="board-index.*?">(.*?)</i>'
    r'.*?<img data-src="(.*?)"'
    r'.*?</a>'
    r'.*?<a href.*?>(.*?)</a>'
    # [^>]*> consumes the rest of the opening tag so the capture holds only
    # the paragraph text, not the tag's closing '>'.
    r'.*?<p class="star"[^>]*>(.*?)</p>'
    r'.*?releasetime">(.*?)</p>'
    r'.*?integer">(.*?)</i>'
    r'.*?fraction">(.*?)</i>',
    re.S,
)


def parse_html(html):
    """Yield one dict per board entry found in *html*.

    Args:
        html: Page markup as a string. ``None`` or an empty string yields
            nothing, so the result of a failed download can be passed in
            directly.

    Yields:
        Dicts with the string keys ``index``, ``img``, ``title``, ``direct``,
        ``time`` and ``pinfen``; ``pinfen`` is the "integer" and "fraction"
        captures concatenated.
    """
    if not html:
        return
    # str.replace returns a new string; the result has to be rebound for the
    # newline removal to take effect.
    html = html.replace('\n', '')
    for index, img, title, star, release, integer, fraction in _MOVIE_PATTERN.findall(html):
        yield {
            'index': index,
            'img': img,
            'title': title,
            'direct': star,
            'time': release,
            'pinfen': integer + fraction,
        }
def write_to_file(content, path='1.txt'):
    """Append *content* to *path* as a single line of JSON.

    Args:
        content: JSON-serialisable object to store.
        path: File to append to; it is opened in append mode as UTF-8.
            Defaults to ``'1.txt'``.
    """
    # The with-statement closes the file on exit, so no explicit close().
    with open(path, 'a', encoding='utf-8') as f:
        # ensure_ascii=False writes non-ASCII text as-is instead of \uXXXX.
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
# NOTE(review): the URL string literal on this line is missing from the
# reviewed source (only its opening quote survives). The address below is
# reconstructed from the markup that parse_html's pattern targets and from the
# offset stepping used by the script entry point -- confirm before relying on it.
BOARD_URL = 'http://maoyan.com/board/4'


def main(offset):
    """Fetch the page at *offset*, parse it and append every entry to the output file.

    Args:
        offset: Value sent as the ``offset`` query parameter of the page URL.
    """
    url = BOARD_URL + '?offset=' + str(offset)
    html = get_one_page(url)
    if html is None:
        # get_one_page returns None when the download fails; nothing to parse.
        return
    for item in parse_html(html):
        write_to_file(item)
if __name__ == '__main__':
    # Pool() has to be instantiated before map() can be called on it; the
    # with-block releases the worker processes once the blocking map() returns.
    with Pool() as pool:
        # One task per page offset: 0, 10, ..., 90.
        pool.map(main, [i * 10 for i in range(10)])