from urllib.parse import urlencode

import requests
from pymongo import MongoClient
from pyquery import PyQuery as pq

base_url = 'https://m.weibo.cn/api/container/getIndex?'
headers = {
    'Host': 'm.weibo.cn',
    'Referer': 'https://m.weibo.cn/u/2830678474',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest'
}


def get_page(page):
    # Request one page of the user's timeline from the Ajax API and return the parsed JSON.
    params = {
        'type': 'uid',
        'value': '2830678474',
        'containerid': '1076032830678474',
        'page': page
    }
    url = base_url + urlencode(params)
    print(url)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.json()
    except requests.ConnectionError as e:
        print('Error', e.args)


def parse_page(json_data):
    # Extract the fields of interest from each card and yield one dict per post.
    if json_data:
        items = json_data.get('data', {}).get('cards', [])
        for item in items:
            item = item.get('mblog')
            if not item:
                # Some cards (ads, card groups) carry no 'mblog' entry; skip them.
                continue
            weibo = {}
            weibo['id'] = item.get('id')
            weibo['text'] = pq(item.get('text')).text()  # strip HTML tags from the post body
            weibo['attitudes'] = item.get('attitudes_count')
            weibo['comments'] = item.get('comments_count')
            weibo['reposts_count'] = item.get('reposts_count')
            yield weibo


def save_to_mongo(result):
    # Insert a single post into the cweibo collection.
    client = MongoClient(host='localhost', port=27017)
    db = client.admin
    collection = db.cweibo
    collection.insert_one(result)


if __name__ == '__main__':
    for page in range(2, 11):
        json_data = get_page(page)
        results = parse_page(json_data)
        for result in results:
            save_to_mongo(result)
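Before crawling all pages, it can help to confirm that the getIndex endpoint actually returns the structure the parser expects (data → cards → mblog). The following is a minimal sanity check, not part of the script above; it assumes network access to m.weibo.cn and uses the same uid, containerid, and field names as the code.

# Hypothetical one-off check of a single page of the Ajax response.
from urllib.parse import urlencode
import requests

url = 'https://m.weibo.cn/api/container/getIndex?' + urlencode({
    'type': 'uid',
    'value': '2830678474',
    'containerid': '1076032830678474',
    'page': 2
})
data = requests.get(url, headers={'User-Agent': 'Mozilla/5.0',
                                  'X-Requested-With': 'XMLHttpRequest'}).json()
cards = data.get('data', {}).get('cards', [])
print(len(cards), 'cards returned')
if cards:
    # Expect keys such as id, text, attitudes_count, comments_count, reposts_count.
    print(sorted(cards[0].get('mblog', {}).keys()))

If the printed keys match those used in parse_page, the full crawl in the script above should run unchanged.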