7-proxy-use

7. 代理的使用

代理的设置、代理池的维护、付费代理的使用、ADSL拨号代理等。
这里以使用代理爬取微信公众号文章为例:

  1. config.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
# --- Redis: backing store for the request queue ---
REDIS_HOST = "localhost"
REDIS_PORT = 6379
REDIS_PASSWORD = "123456"
# Name of the Redis list that holds the serialized requests.
REDIS_KEY = "weixin"

# --- Proxy pool: endpoint queried for a proxy address ---
PROXY_POOL_URL = "http://127.0.0.1:5555/random"

# --- MySQL: storage for the scraped articles ---
MYSQL_HOST = "localhost"
MYSQL_PORT = 3306
MYSQL_USER = "root"
MYSQL_PASSWORD = "123456"
MYSQL_DATABASE = "weixin"

# --- Crawl behaviour ---
# Default per-request timeout.
TIMEOUT = 10
# A request is re-queued on failure until its failure count reaches this value.
MAX_FAILED_TIME = 20
# HTTP status codes that count as a successful response.
VALID_STATUSES = [200]
  2. request.py
1
2
3
4
5
6
7
8
9
10
11
12
13

from config import *
from requests import Request


class WeixinRequest(Request):
    """A ``requests.Request`` that also carries crawl-scheduling metadata."""

    def __init__(self, url, callback, method='GET', headers=None,
                 need_proxy=False, fail_time=0, timeout=TIMEOUT):
        """Build the request and attach the scheduling attributes.

        :param url: target URL
        :param callback: stored on the instance as ``callback``
        :param method: HTTP method, ``'GET'`` by default
        :param headers: optional per-request headers
        :param need_proxy: stored on the instance as ``need_proxy``
        :param fail_time: stored on the instance as ``fail_time``
        :param timeout: stored on the instance as ``timeout``
        """
        # The base class initialises method, URL and headers.
        super().__init__(method=method, url=url, headers=headers)
        self.timeout = timeout
        self.fail_time = fail_time
        self.need_proxy = need_proxy
        self.callback = callback

  3. db.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
from redis import StrictRedis
from config import *
from pickle import dumps, loads
from request import WeixinRequest


class RedisQueue():
    """FIFO queue of pickled ``WeixinRequest`` objects kept in a Redis list."""

    def __init__(self):
        """Create the Redis client from the connection settings in ``config``."""
        self.db = StrictRedis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD)

    def add(self, request):
        """Serialize a request and append it to the tail of the queue.

        :param request: the request object to enqueue
        :return: the result of ``rpush``, or ``False`` when ``request`` is
            not a ``WeixinRequest``
        """
        if isinstance(request, WeixinRequest):
            return self.db.rpush(REDIS_KEY, dumps(request))
        return False

    def pop(self):
        """Remove the request at the head of the queue and deserialize it.

        :return: the deserialized request, or ``False`` when the queue is empty
        """
        # Pop first and test the result instead of checking the length
        # beforehand: another consumer could drain the list between the two
        # calls, and unpickling the resulting None would raise.
        raw = self.db.lpop(REDIS_KEY)
        if raw is None:
            return False
        # NOTE(security): pickle.loads can execute arbitrary code. This is
        # only safe while nothing untrusted can write to this Redis key.
        return loads(raw)

    def clear(self):
        """Delete the whole queue."""
        self.db.delete(REDIS_KEY)

    def empty(self):
        """Return ``True`` when the queue holds no requests."""
        return self.db.llen(REDIS_KEY) == 0

if __name__ == '__main__':
    # Manual round trip: enqueue one request, pop it back, and print the
    # restored object together with its scheduling attributes.
    queue = RedisQueue()
    queue.add(WeixinRequest(url='http://www.baidu.com', callback='hello', need_proxy=True))
    restored = queue.pop()
    print(restored)
    print(restored.callback, restored.need_proxy)
  4. mysql.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
import pymysql
from config import *

class MySQL():
    """Thin wrapper around a PyMySQL connection for inserting rows."""

    def __init__(self, host=MYSQL_HOST, username=MYSQL_USER, password=MYSQL_PASSWORD,
                 port=MYSQL_PORT, database=MYSQL_DATABASE):
        """Open the connection and create a cursor.

        Connection errors are printed, not raised; in that case ``db`` and
        ``cursor`` stay ``None``.

        :param host: server host
        :param username: login user
        :param password: login password
        :param port: server port
        :param database: database to use
        """
        # Define both attributes up front so they exist even if connect() fails.
        self.db = None
        self.cursor = None
        try:
            # Keyword arguments: the positional form skipped the user name
            # (shifting password/database into the wrong slots) and is not
            # accepted by current PyMySQL releases.
            self.db = pymysql.connect(host=host, user=username, password=password,
                                      database=database, charset='utf8', port=port)
            self.cursor = self.db.cursor()
        except pymysql.MySQLError as e:
            print(e.args)

    def insert(self, table, data):
        """Insert one row into ``table``.

        On a MySQL error the error is printed and the transaction rolled back.

        :param table: table name; interpolated into the SQL text, so it must
            come from trusted code
        :param data: mapping of column name to value; the column names are
            interpolated into the SQL text, the values are bound as parameters
        """
        if self.cursor is None:
            # The connection failed in __init__; there is nothing to write to.
            print('MySQL connection is not available, insert skipped')
            return
        keys = ', '.join(data.keys())
        values = ', '.join(['%s'] * len(data))
        sql_query = 'insert into %s (%s) values (%s)' % (table, keys, values)
        try:
            self.cursor.execute(sql_query, tuple(data.values()))
            self.db.commit()
        except pymysql.MySQLError as e:
            print(e.args)
            self.db.rollback()
  1. mysql.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
import pymysql
from config import *

class MySQL():
    """Thin wrapper around a PyMySQL connection for inserting rows."""

    def __init__(self, host=MYSQL_HOST, username=MYSQL_USER, password=MYSQL_PASSWORD,
                 port=MYSQL_PORT, database=MYSQL_DATABASE):
        """Open the connection and create a cursor.

        Connection errors are printed, not raised; in that case ``db`` and
        ``cursor`` stay ``None``.

        :param host: server host
        :param username: login user
        :param password: login password
        :param port: server port
        :param database: database to use
        """
        # Define both attributes up front so they exist even if connect() fails.
        self.db = None
        self.cursor = None
        try:
            # Keyword arguments: the positional form skipped the user name
            # (shifting password/database into the wrong slots) and is not
            # accepted by current PyMySQL releases.
            self.db = pymysql.connect(host=host, user=username, password=password,
                                      database=database, charset='utf8', port=port)
            self.cursor = self.db.cursor()
        except pymysql.MySQLError as e:
            print(e.args)

    def insert(self, table, data):
        """Insert one row into ``table``.

        On a MySQL error the error is printed and the transaction rolled back.

        :param table: table name; interpolated into the SQL text, so it must
            come from trusted code
        :param data: mapping of column name to value; the column names are
            interpolated into the SQL text, the values are bound as parameters
        """
        if self.cursor is None:
            # The connection failed in __init__; there is nothing to write to.
            print('MySQL connection is not available, insert skipped')
            return
        keys = ', '.join(data.keys())
        values = ', '.join(['%s'] * len(data))
        sql_query = 'insert into %s (%s) values (%s)' % (table, keys, values)
        try:
            self.cursor.execute(sql_query, tuple(data.values()))
            self.db.commit()
        except pymysql.MySQLError as e:
            print(e.args)
            self.db.rollback()
  5. spider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
from requests import Session
from config import *
from db import RedisQueue
from mysql import MySQL
from request import WeixinRequest
from urllib.parse import urlencode
import requests
from pyquery import PyQuery as pq
from requests import ReadTimeout,ConnectionError

class Spider():
    """Crawl article search results through a proxy pool and store them.

    Requests are queued in Redis, fetched with a shared ``requests`` session,
    handed to their callback, and the resulting dicts are written to MySQL.
    """

    # Host spelled to match the 'Host' header below (was 'sougou').
    base_url = 'http://weixin.sogou.com/weixin'
    keyword = 'NBA'
    # NOTE(review): 'Host' is pinned to weixin.sogou.com and these headers are
    # applied to every request sent through the session, including detail
    # pages whose URLs come from scraped links - confirm that is intended.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2,mt;q=0.2',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'IPLOC=CN1100; SUID=6FEDCF3C541C940A000000005968CF55; SUV=1500041046435211; ABTEST=0|1500041048|v1; SNUID=CEA85AE02A2F7E6EAFF9C1FE2ABEBE6F; weixinIndexVisited=1; JSESSIONID=aaar_m7LEIW-jg_gikPZv; ld=Wkllllllll2BzGMVlllllVOo8cUlllll5G@HbZllll9lllllRklll5@@@@@@@@@@; LSTMV=212%2C350; LCLKINT=4650; ppinf=5|1500042908|1501252508|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZTo1NDolRTUlQjQlOTQlRTUlQkElODYlRTYlODklOEQlRTQlQjglQTglRTklOUQlOTklRTglQTclODV8Y3J0OjEwOjE1MDAwNDI5MDh8cmVmbmljazo1NDolRTUlQjQlOTQlRTUlQkElODYlRTYlODklOEQlRTQlQjglQTglRTklOUQlOTklRTglQTclODV8dXNlcmlkOjQ0Om85dDJsdUJfZWVYOGRqSjRKN0xhNlBta0RJODRAd2VpeGluLnNvaHUuY29tfA; pprdig=ppyIobo4mP_ZElYXXmRTeo2q9iFgeoQ87PshihQfB2nvgsCz4FdOf-kirUuntLHKTQbgRuXdwQWT6qW-CY_ax5VDgDEdeZR7I2eIDprve43ou5ZvR0tDBlqrPNJvC0yGhQ2dZI3RqOQ3y1VialHsFnmTiHTv7TWxjliTSZJI_Bc; sgid=27-27790591-AVlo1pzPiad6EVQdGDbmwnvM; PHPSESSID=mkp3erf0uqe9ugjg8os7v1e957; SUIR=CEA85AE02A2F7E6EAFF9C1FE2ABEBE6F; sct=11; ppmdig=1500046378000000b7527c423df68abb627d67a0666fdcee; successCount=1|Fri, 14 Jul 2017 15:38:07 GMT',
        'Host': 'weixin.sogou.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
    }
    session = Session()
    queue = RedisQueue()
    mysql = MySQL()

    def get_proxy(self):
        """Ask the proxy pool for a proxy.

        :return: the response body of the pool endpoint, or ``None`` when the
            pool answers with a non-200 status or cannot be reached
        """
        try:
            response = requests.get(PROXY_POOL_URL)
            if response.status_code == 200:
                print('Get Proxy', response.text)
                return response.text
            return None
        except requests.ConnectionError:
            return None

    def start(self):
        """Install the default headers on the session and queue the first request."""
        # Session-wide headers; they are merged into each request when it is
        # prepared through the session in request().
        self.session.headers.update(self.headers)
        start_url = self.base_url + '?' + urlencode({'query': self.keyword, 'type': 2})
        weixin_request = WeixinRequest(url=start_url, callback=self.parse_index, need_proxy=True)
        # Schedule the first request.
        self.queue.add(weixin_request)

    def parse_index(self, response):
        """Parse an index (search result) page.

        :param response: the HTTP response of the index page
        :return: generator yielding one request per article link, followed by
            a request for the next index page when a next-page link exists
        """
        doc = pq(response.text)
        items = doc('.news-box .news-list li .txt-box h3 a').items()
        for item in items:
            url = item.attr('href')
            weixin_request = WeixinRequest(url=url, callback=self.parse_detail)
            yield weixin_request
        # Named next_href so the builtin next() is not shadowed.
        next_href = doc('#sogou_next').attr('href')
        if next_href:
            url = self.base_url + str(next_href)
            weixin_request = WeixinRequest(url=url, callback=self.parse_index, need_proxy=True)
            yield weixin_request

    def parse_detail(self, response):
        """Parse an article detail page.

        :param response: the HTTP response of the detail page
        :return: generator yielding a single dict with the article fields
        """
        doc = pq(response.text)
        data = {
            'title': doc('.rich_media_title').text(),
            'content': doc('.rich_media_content').text(),
            'date': doc('#post-date').text(),
            'nickname': doc('#js_profile_qrcode > div > strong').text(),
            'wechat': doc('#js_profile_qrcode > div > p:nth-child(3) > span').text()
        }
        yield data

    def request(self, weixin_request):
        """Send a request through the shared session.

        :param weixin_request: the request to execute
        :return: the response, or ``False`` on a connection error or read timeout
        """
        try:
            # Prepare through the session so that the headers installed in
            # start() are merged in; Request.prepare() alone ignores all
            # session state.
            prepared = self.session.prepare_request(weixin_request)
            send_kwargs = {'timeout': weixin_request.timeout, 'allow_redirects': False}
            if weixin_request.need_proxy:
                proxy = self.get_proxy()
                if proxy:
                    # Only pass 'proxies' when a proxy was obtained, so the
                    # session's own proxy handling applies otherwise.
                    send_kwargs['proxies'] = {
                        'http': 'http://' + proxy,
                        'https': 'https://' + proxy
                    }
            return self.session.send(prepared, **send_kwargs)
        except (ConnectionError, ReadTimeout) as e:
            print(e.args)
            return False

    def error(self, weixin_request):
        """Record a failure and re-queue the request while under the retry limit.

        :param weixin_request: the request that failed
        :return: None
        """
        weixin_request.fail_time = weixin_request.fail_time + 1
        print('Request Failed', weixin_request.fail_time, 'Times', weixin_request.url)
        if weixin_request.fail_time < MAX_FAILED_TIME:
            self.queue.add(weixin_request)

    def schedule(self):
        """Process queued requests until the queue is empty.

        Each response with a valid status is passed to the request's callback;
        yielded requests are queued and yielded dicts are inserted into the
        'articles' table. A bad response or a callback that yields nothing
        counts as a failure.

        :return: None
        """
        # Loop while there IS work; the previous condition was inverted and
        # only entered the loop when the queue was empty.
        while not self.queue.empty():
            weixin_request = self.queue.pop()
            if not weixin_request:
                # pop() reports an empty queue with a falsy value; re-check
                # the loop condition instead of dereferencing it.
                continue
            callback = weixin_request.callback
            print('Schedule', weixin_request.url)
            response = self.request(weixin_request)
            if response and response.status_code in VALID_STATUSES:
                results = list(callback(response))
                if results:
                    for result in results:
                        print('New Result', type(result))
                        if isinstance(result, WeixinRequest):
                            self.queue.add(result)
                        if isinstance(result, dict):
                            self.mysql.insert('articles', result)
                else:
                    self.error(weixin_request)
            else:
                self.error(weixin_request)

    def run(self):
        """Entry point: queue the first request, then run the scheduling loop.

        :return: None
        """
        self.start()
        self.schedule()

if __name__ == '__main__':
    # Build a spider and start crawling when executed as a script.
    Spider().run()
  6. run.py
1
2
3
4
5
from weixin.spider import Spider

if __name__ == '__main__':
    # Script entry point: create the spider and run it.
    Spider().run()
分享到