1. Learning the Basic Libraries for Web Scraping

1. Using Python's Basic Scraping Libraries

1.1 Learning to Use the urllib Library

The urllib library is one of Python's built-in HTTP request libraries (others include httplib2, requests, treq, etc.). It contains the following four modules:

  • request: the most basic HTTP request module, used to simulate sending a request;
  • error: the exception handling module;
  • parse: a utility module that provides many URL-handling methods;
  • robotparser: mainly used to parse a site's robots.txt file and decide whether the site may be crawled (rarely used in practice).
1.1.1 Sending Requests (the request module)

a. urlopen()

import urllib.request
# Basic usage of urlopen()
url = 'https://www.python.org/'

# The return value is an HTTPResponse object;
# get to know its common methods and attributes
root_htmls = urllib.request.urlopen(url)
print(root_htmls)        # <http.client.HTTPResponse object at 0x02972030>
print(type(root_htmls))  # <class 'http.client.HTTPResponse'>
# Call the object's read() method to get the actual page content
htmls = root_htmls.read().decode('utf-8')  # decode so the output is readable
print(htmls)
# Get the Server value from the response headers
server_name = root_htmls.getheader('Server')
print(server_name)  # Server header value: nginx

The data parameter of urlopen():

import urllib.request
import urllib.parse

# Common urlopen() parameters
url = 'http://httpbin.org/post'
dw = {'word': 'hello'}
# 1. The data parameter
# First use parse.urlencode() to turn the parameter dict into a query string
dw = urllib.parse.urlencode(dw)
print(dw)  # word=hello
# Then use bytes() to convert dw into a bytes object
data1 = bytes(dw, encoding='utf8')
res = urllib.request.urlopen(url, data=data1)
print(res.read())

# result:
g:\Python\Demon2\new_book\basic_lib>b2.py
word=hello
b'{"args":{},"data":"","files":{},"form":{"word":"hello"},"headers":{"Accept-Encoding":"identity","Connection":"close","Content-Length":"10","Content-Type":"application/x-www-form-urlencoded","Host":"httpbin.org","User-Agent":"Python-urllib/3.6"},"json":null,"origin":"61.158.149.229","url":"http://httpbin.org/post"}\n'

The timeout parameter of urlopen():

import urllib.request
import urllib.error
import socket

# 2. The timeout parameter sets a timeout in seconds;
# if no response arrives within that time, an exception is raised
urlt = 'http://httpbin.org/get'

try:
    troot_htmls = urllib.request.urlopen(urlt, timeout=0.01)
    thtmls = troot_htmls.read()
    print(troot_htmls)
# Check whether the URLError's reason is of type socket.timeout (a timeout exception)
except urllib.error.URLError as e:
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')

b. Request

urlopen() can only issue the most basic requests. To build a complete request and add headers and other information, use the more powerful Request class.

import urllib.request, urllib.parse

URL = 'http://httpbin.org/post'
data_ori = {'name': 'Tom'}
data_pu = urllib.parse.urlencode(data_ori)
data = bytes(data_pu, encoding='utf8')
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
    'Host': 'httpbin.org'}

# Wrap the URL in a Request object so the request can be configured
# more flexibly with url, data, headers, method and other common parameters
req = urllib.request.Request(URL, data=data, headers=headers, method='POST')
# Headers can also be added afterwards, one at a time:
# req2 = urllib.request.Request(URL, data=data, method='POST')
# req2.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0')
print(req)
root_htmls = urllib.request.urlopen(req)
htmls = root_htmls.read().decode('utf-8')
print(htmls)

# result:
g:\Python\Demon2\new_book\basic_lib>b4.py
<urllib.request.Request object at 0x01F02AD0>
{"args":{},"data":"","files":{},"form":{"name":"Tom"},"headers":{"Accept-Encoding":"identity","Connection":"close","Content-Length":"8","Content-Type":"application/x-www-form-urlencoded","Host":"httpbin.org","User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0"},"json":null,"origin":"61.158.149.5","url":"http://httpbin.org/post"}

c. Advanced usage: cookie handling, proxy settings, authentication, and more

The BaseHandler class in urllib.request (the parent of the various handlers) makes these features possible. Its common subclasses are listed below; a short sketch of combining several handlers into one opener follows the list.

  • HTTPDefaultErrorHandler: handles HTTP response errors by raising HTTPError;
  • HTTPRedirectHandler: handles redirects;
  • HTTPCookieProcessor: handles cookies;
  • ProxyHandler: sets a proxy (the default proxy is empty);
  • HTTPPasswordMgr: manages passwords, maintaining a table of usernames and passwords;
  • HTTPBasicAuthHandler: manages authentication; use it when opening a link requires credentials.
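Several handlers can be chained into a single opener with build_opener(). The snippet below is only a rough sketch under assumptions that are not in the original notes (a local proxy listening on 127.0.0.1:9743, httpbin.org as the test URL):

import http.cookiejar
import urllib.request

# Build one opener that both routes traffic through a proxy and tracks cookies.
# The proxy address is an assumption; this only works if such a proxy is running.
cookie_jar = http.cookiejar.CookieJar()
handlers = [
    urllib.request.ProxyHandler({'http': 'http://127.0.0.1:9743'}),
    urllib.request.HTTPCookieProcessor(cookie_jar),
]
opener = urllib.request.build_opener(*handlers)
# Optionally install it globally so that plain urlopen() calls use it as well
urllib.request.install_opener(opener)
res = opener.open('http://httpbin.org/get')
print(res.status)
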
Opening a site that requires authentication
from urllib.request import HTTPPasswordMgrWithDefaultRealm, HTTPBasicAuthHandler, build_opener
from urllib.error import URLError

# Open a site that requires HTTP basic authentication

username = 'zhangsan'
pwd = '3'
url = 'http://localhost:5000/'

# Build an HTTPPasswordMgrWithDefaultRealm object
p = HTTPPasswordMgrWithDefaultRealm()
# Register the url, username and password with it
p.add_password(None, url, username, pwd)
# Instantiate HTTPBasicAuthHandler to create a handler that performs the authentication
auth_handler = HTTPBasicAuthHandler(p)
# Pass the handler to build_opener() to construct an opener
opener = build_opener(auth_handler)

try:
    # Get the page source after authentication succeeds
    result = opener.open(url)
    html = result.read().decode('utf-8')
    print(html)
except URLError as e:
    print(e.reason)
Proxy settings
from urllib.error import URLError
from urllib.request import ProxyHandler, build_opener

# Proxy settings
url = 'https://www.baidu.com/'
# A locally running proxy listening on port 9743
ph = {'http': 'http://127.0.0.1:9743',
      'https': 'https://127.0.0.1:9743'}

proxy_handler = ProxyHandler(ph)
opener = build_opener(proxy_handler)

try:
    res = opener.open(url)
    htmls = res.read().decode('utf-8')
    print(htmls)
except URLError as e:
    print(e.reason)
Cookie handling
import http.cookiejar, urllib.request

# Cookie handling
url = 'https://www.baidu.com/'

# Declare a CookieJar object
cookie = http.cookiejar.CookieJar()
# Use HTTPCookieProcessor to build a handler from it
handler = urllib.request.HTTPCookieProcessor(cookie)
# Use build_opener to construct the opener, then call its open() method
opener = urllib.request.build_opener(handler)
res = opener.open(url)

for item in cookie:
    print(item.name + ' = ' + item.value)

print('=== Save the cookies as text (Mozilla browser format) ===')

# Save the cookies to a text file in the Mozilla browser cookie format
url2 = 'https://www.baidu.com/'
filename = 'cookies.txt'
# MozillaCookieJar is a subclass of CookieJar that handles
# file-related cookie tasks such as reading and saving cookies;
# it stores them in the Mozilla browser cookie format
cookie2 = http.cookiejar.MozillaCookieJar(filename)
handler2 = urllib.request.HTTPCookieProcessor(cookie2)
opener2 = urllib.request.build_opener(handler2)
res2 = opener2.open(url2)
cookie2.save(ignore_discard=True, ignore_expires=True)

print('=== Save the cookies as text (LWP format) ===')
# Save the cookies to a text file in the LWP format
url3 = 'https://www.baidu.com/'
filename3 = '3cookies.txt'

cookie3 = http.cookiejar.LWPCookieJar(filename3)
handler3 = urllib.request.HTTPCookieProcessor(cookie3)
opener3 = urllib.request.build_opener(handler3)
res3 = opener3.open(url3)
cookie3.save(ignore_discard=True, ignore_expires=True)

print('=== Load the local cookie file ===')

cookie_get = http.cookiejar.LWPCookieJar()
cookie_get.load(filename3, ignore_discard=True, ignore_expires=True)
handler4 = urllib.request.HTTPCookieProcessor(cookie_get)
opener4 = urllib.request.build_opener(handler4)
res4 = opener4.open(url3)
res4_r = res4.read().decode('utf-8')
print(res4_r)

1.1.2 Handling Exceptions (the error module)

Handling exceptions keeps the program from stopping with an error when the network is unstable or a request fails.

from urllib import request, error
import socket
# Exception handling
url = 'https://cuiqingcai.com/index.htm'
try:
    res = request.urlopen(url)
    print(res.read().decode('utf-8'))
# HTTPError is a subclass of URLError
except error.HTTPError as e:
    print(e.reason, e.code, e.headers, sep='\n')
except error.URLError as e:
    if isinstance(e.reason, socket.timeout):
        print('timeout')
    print(e.reason)
else:
    print('request successfully')

# result:
g:\Python\Demon2\new_book\basic_lib>b8.py
Not Found
404
Server: nginx/1.10.3 (Ubuntu)
Date: Tue, 10 Jul 2018 05:58:21 GMT
Content-Type: text/html; charset=UTF-8
Transfer-Encoding: chunked
Connection: close
Vary: Cookie
Expires: Wed, 11 Jan 1984 05:00:00 GMT
Cache-Control: no-cache, must-revalidate, max-age=0
Link: <https://cuiqingcai.com/wp-json/>; rel="https://api.w.org/"

1.1.3 Parsing URLs (the parse module)

The parse module defines the standard interface for handling URLs.

a. urlparse()

Identifies the parts of a URL and splits it into six components: scheme, netloc, path, params, query, fragment.

from urllib.parse import urlparse

url = 'https://www.baidu.com/index.html;user?id=5#comment'
result = urlparse(url, scheme='http', allow_fragments=True)
print(type(result))  # <class 'urllib.parse.ParseResult'>
print(result)
# ParseResult(scheme='https', netloc='www.baidu.com',
# path='/index.html', params='user', query='id=5', fragment='comment')
print(result.scheme, result.netloc, result.path, result.params,
      result.query, result.fragment, sep='\n')
print(result[0], result[1])  # https www.baidu.com

b. urlunparse()

The reverse operation: it assembles a URL from components. It accepts any iterable, but its length must be exactly 6.

from urllib.parse import urlunparse

url_data = ['https', 'www.baidu.com', '/index.html',
            'user', 'id=5', 'comment']
url = urlunparse(url_data)
print(url)
# https://www.baidu.com/index.html;user?id=5#comment

c. urlsplit()

Very similar to urlparse(), except that params is not split out separately (it stays merged into path), so the result has only five parts.

from urllib.parse import urlsplit

url2 = 'https://www.baidu.com/index.html;user?id=5#comment'
data = urlsplit(url2)
print(data)
# SplitResult(scheme='https', netloc='www.baidu.com',
# path='/index.html;user', query='id=5', fragment='comment')
print(data.netloc, data[1], end='\n')
# www.baidu.com www.baidu.com

d. urlunsplit()

Similar to urlunparse(): the argument must also be an iterable, but its length must be 5.

from urllib.parse import urlunsplit

url_data2 = ['https', 'www.baidu.com', 'index.html',
             'id=5', 'comment']
url3 = urlunsplit(url_data2)
print(url3)
# https://www.baidu.com/index.html?id=5#comment

e. urljoin()

Also merges links. It takes two arguments: a base_url (the base link) as the first and the new link as the second. The method analyzes the scheme, netloc, and path of base_url, uses them to fill in whatever the new link is missing, and returns the result.

from urllib.parse import urljoin

base_url1 = 'https://www.baidu.com'
base_url2 = 'https://www.baidu.com/anb.html'
base_url3 = 'https://www.baidu.com?qwae=87'
base_url4 = 'www.baidu.com'
base_url5 = 'www.baidu.com#ojbk'
new_url1 = 'https://cuiqingcai.com/FS.htm'
new_url2 = '?category=2#sdf'

u1 = urljoin(base_url1, 'FS.html')
u2 = urljoin(base_url2, new_url1)
u3 = urljoin(base_url3, new_url1)
u4 = urljoin(base_url4, new_url2)
u5 = urljoin(base_url5, new_url2)

print(u1)  # https://www.baidu.com/FS.html
print(u2)  # https://cuiqingcai.com/FS.htm
print(u3)  # https://cuiqingcai.com/FS.htm
print(u4)  # www.baidu.com?category=2#sdf
print(u5)  # www.baidu.com?category=2#sdf

As you can see, base_url contributes three components: scheme, netloc, and path. When any of these is missing from the new link, it is filled in from base_url; when the new link already has it, the new link's own value is used. The params, query, and fragment of base_url have no effect.

f. urlencode()

This method is very handy when constructing GET request parameters: it converts a dict of parameters into a GET query string.

from urllib.parse import urlencode

query = {'name':'Jou','age':2}
query = urlencode(query)
print(query) # name=Jou&age=2
base_url = 'https://www.baidu.com?'
url = base_url + query
print(url) #https://www.baidu.com?name=Jou&age=2

g. parse_qs() and parse_qsl()

Both do the opposite of urlencode():
1. parse_qs() converts GET request parameters into a dict;
2. parse_qsl() converts GET request parameters into a list of tuples.

from urllib.parse import parse_qs, parse_qsl

query2 = 'name=Jou&age=2'
data = parse_qs(query2)
print(data)# {'name': ['Jou'], 'age': ['2']}

data2 = parse_qsl(query2)
print(data2)# [('name', 'Jou'), ('age', '2')]

h. Converting Chinese characters to URL encoding and back: quote() and unquote()

from urllib.parse import quote,unquote

keyword = '你好'
url = 'https://www.baidu.com?'
qk = quote(keyword)
print(qk) # %E4%BD%A0%E5%A5%BD
url = url + qk
print(url) # https://www.baidu.com?%E4%BD%A0%E5%A5%BD

kw = unquote(qk)
url_kw = unquote(url)
print(kw) # 你好
print(url_kw) # https://www.baidu.com?你好
1.1.4 Analyzing the Robots Protocol (the robotparser module)

Also known as the crawler protocol or robots protocol, its full name is the Robots Exclusion Standard. It tells crawlers and search engines which pages may be crawled and which may not.
The robotparser module provides the RobotFileParser class, which uses a site's robots.txt file to decide whether a crawler has permission to fetch a given page.

from urllib.robotparser import RobotFileParser
# Create a RobotFileParser object
rb = RobotFileParser()
url = 'http://www.jianshu.com/robots.txt'
url_fetch1 = 'http://www.jianshu.com/p/b67554025d7d'
url_fetch = 'http://www.jianshu.com/search?q=python&page=1&type=collections'
url2 = 'https://www.jianshu.com/'
# Point it at the robots.txt link, which it uses to decide whether crawling is allowed
rb.set_url(url)
# Fetch the robots.txt file and analyze it
rb.read()
rbool = rb.can_fetch('*', url2)
r1bool = rb.can_fetch('*', url_fetch1)
r2bool = rb.can_fetch('*', url_fetch)
print(rbool)   # False
print(r1bool)  # False
print(r2bool)  # False
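
RobotFileParser can also be fed the rules as lines of text through its parse() method instead of set_url()/read(). A minimal sketch of that variant, reusing the URLs above:

from urllib.robotparser import RobotFileParser
from urllib.request import urlopen

# Download robots.txt ourselves and hand its lines to the parser
rb = RobotFileParser()
lines = urlopen('http://www.jianshu.com/robots.txt').read().decode('utf-8').split('\n')
rb.parse(lines)
print(rb.can_fetch('*', 'http://www.jianshu.com/p/b67554025d7d'))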

1.2 Using the requests Library

1.2.1 Basic Usage
import requests

url = 'https://www.baidu.com/'
url_post = 'http://httpbin.org/post'
url_put = 'http://httpbin.org/put'
url_delete = 'http://httpbin.org/delete'
url_get = 'http://httpbin.org/get'
# Each HTTP method has a correspondingly named function
r1 = requests.post(url_post)
r2 = requests.put(url_put)
r3 = requests.delete(url_delete)
r4 = requests.head(url_get)
r5 = requests.options(url_get)

# Common attributes of the Response object
r = requests.get(url)
print(type(r))
print(r.status_code)
print(type(r.text))
print(r.text)
print(r.cookies)

GET requests

import requests

url = 'http://httpbin.org/get'
data = {'name': 'Jim', 'age': 18}
# The dict is attached to the URL as query-string parameters via params
r = requests.get(url, params=data)
print(r.text)

# r2 = requests.get(url)
# print(type(r2))
# print(type(r2.text))
print(r.json())
print(type(r.json()))

Scraping Zhihu explore questions:

import requests
import re
# Scrape question titles from Zhihu's explore page

url = 'https://www.zhihu.com/explore'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) \
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
r = requests.get(url, headers=headers)
# print(r.text)
pattern = re.compile('explore-feed.*?question_link.*?>(.*?)</a>', re.S)
titles = re.findall(pattern, r.text)
print(titles)

Scraping the GitHub favicon

import requests

# Grab the GitHub favicon (binary content)
url = 'https://github.com/favicon.ico'

r = requests.get(url)
# print(r.text)
# print(r.content)
with open('github.pic', 'wb') as f:
    f.write(r.content)

1.2.2 Advanced Usage

a. File upload

import requests

# File upload

url = 'http://httpbin.org/post'

files = {'file': open('github.pic', 'rb')}
r = requests.post(url, files=files)
print(r.text)

b. Cookies

Getting and setting cookies with requests is very concise:
1. Getting cookies

import requests

url = 'https://www.baidu.com/'
r = requests.get(url)
cookies_baidu = r.cookies
print(cookies_baidu)
# <RequestsCookieJar[<Cookie BDORZ=27315 for .baidu.com/>]>

# Use items() to turn the jar into a list of tuples and iterate over the cookies
print(cookies_baidu.items())  # [('BDORZ', '27315')]
for key, value in cookies_baidu.items():
    print(key + ':' + value)
# BDORZ:27315
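
As an aside, requests can also send cookies with a request through the cookies parameter, either as a plain dict or as a RequestsCookieJar. A small illustrative sketch against httpbin.org (the cookie name and value are made up):

import requests

# Pass cookies directly with the request (illustrative values)
url = 'http://httpbin.org/cookies'
r = requests.get(url, cookies={'number': '1234'})
print(r.text)  # httpbin echoes back {"cookies": {"number": "1234"}}

# The same thing using a RequestsCookieJar
jar = requests.cookies.RequestsCookieJar()
jar.set('number', '1234', domain='httpbin.org', path='/')
r2 = requests.get(url, cookies=jar)
print(r2.text)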

2. Using cookies to maintain a login session
import requests

# Use a cookie to keep a Zhihu login session alive
# (the value below was copied from a logged-in browser session)

url = 'https://www.zhihu.com/'
headers = {
    'Cookie': '_zap=c74ffe60-fc10-4d88-8507-8b2f094ffe47; \
z_c0="2|1:0|10:1524489650|4:z_c0|92:Mi4xQU5UbkFnQUFBQUFB\
QUtDRUdyNThEU1lBQUFCZ0FsVk5zaXZMV3dBR1VjamtZZ3NWWVZBbjZEUV\
JFbElrTGpoOGZn|9ccd196c7209f86d929e45a1390e95ba02dc65699bce56\
f85c4725dec94e1365"; d_c0="ABBh7ahaiA2PTpF7sbd2mFngNGhU8hYDlnk=|1525268870";\
_xsrf=783a0476-0ab1-43a0-8d65-c5762532455d; q_c1=14b00bbf304245c79d6eb10cdb7\
195f6|1530178859000|1524489635000; __utmc=518543923; __utmv=51854390.100--|2=re\
gistration_date=20160419=1^3=entry_date=20160419=1; tgw_l7_route=69f52e0ac392bb4\
3ffb22fc18a173ee6; __utma=51854390.1263074800.1531224527.1531276507.1531281798.3; \
__utmb=51854390.0.10.1531281798; __utmz=51854390.1531281798.3.3.utmcsr=zhihu.com|u\
tmccn=(referral)|utmcmd=referral|utmcct=/',
    'Host': 'www.zhihu.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) \
Chrome/67.0.3396.99 Safari/537.36'
}
r = requests.get(url, headers=headers)
print(r.text)

c. Session persistence (Session)

A Session simulates opening different pages of the same site in one browser, for example carrying out follow-up actions after a simulated login.

import requests

# Session: maintaining a session across requests

url_set = 'http://httpbin.org/cookies/set/number/1234'
url_get = 'http://httpbin.org/cookies'
s = requests.Session()

# Requesting this URL sets a cookie named number with the value 1234
rs = s.get(url_set)
print(rs.text)

# Requesting this URL returns the current cookies
r = s.get(url_get)
print(r.text)

d. SSL certificate verification
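
A minimal sketch of how certificate problems are typically handled with requests' verify parameter; the badssl.com URL below is only a placeholder for any site with an invalid or self-signed certificate, and the warning-suppression call is optional:

import requests
import urllib3

# A site with a self-signed certificate (placeholder URL for illustration)
url = 'https://self-signed.badssl.com/'

# requests verifies certificates by default and raises an SSLError on failure.
# verify=False skips the check; urllib3 then emits an InsecureRequestWarning,
# which can be silenced as below.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
r = requests.get(url, verify=False)
print(r.status_code)

# Alternatively, point verify at a trusted CA bundle, or pass a client
# certificate via cert=('/path/client.crt', '/path/client.key').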

A wrap-up example: scraping the Maoyan Top 100 movie chart

""" 
爬取猫眼电影TOP100
电影名称、时间、评分、图片 等信息
"""
import requests
import re
import json
import time

class MaoYanSpider():

url = 'http://maoyan.com/board/4?offset='

title_pattern = '<dd>.*?name.*?a.*?>(.*?)</a>'
rank_pattern = '<dd>.*?board-index.*?>(.*?)</i>'
pic_pattern = '<dd>.*?data-src="(.*?)@160w'
actor_pattern = '<dd>.*?star.*?>(.*?)</p>'
time_pattern = '<dd>.*?releasetime">(.*?)</p>'
score_pattern = '<dd>.*?score.*?i.*?>(.*?)</i>.*?i.*?>(.*?)</i>'

def get_one_page(self):

headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit\
/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}
response = requests.get(MaoYanSpider.url, headers= headers)
if response.status_code == 200:
return response.text
return None

def refine(self,root_htmls):
titles = re.findall(MaoYanSpider.title_pattern, root_htmls, re.S)
rank = re.findall(MaoYanSpider.rank_pattern, root_htmls, re.S)
pics = re.findall(MaoYanSpider.pic_pattern, root_htmls, re.S)
actors = re.findall(MaoYanSpider.actor_pattern, root_htmls, re.S)
actors_new = []
for actor in actors:
actors_new.append(actor.strip())
time = re.findall(MaoYanSpider.time_pattern, root_htmls, re.S)
score = re.findall(MaoYanSpider.score_pattern, root_htmls, re.S)
score_new = []
for sc in score:
score_new.append(sc[0]+sc[1])

info_movie = []
for index in range(0,len(titles)):
info_movie.append({
'Title':titles[index],'Rank':rank[index],'Actor':actors_new[index],
'Time':time[index],'Score':score_new[index],'Img':pics[index]
})

return info_movie

def write_to_file(self,info_movie):

with open('movie_rank_info.txt','a',encoding='utf-8') as f:
contents = json.dumps(info_movie,ensure_ascii=False)
f.write(contents+'\n')

def main(self):

if __name__ == '__main__':
number = [0,10,20,30,40,50,60,70,80,90]
for i in number:
offset = i
time.sleep(1)
MaoYanSpider.url = MaoYanSpider.url + str(offset)
print(MaoYanSpider.url)
root_htmls = self.get_one_page()
info_movie = self.refine(root_htmls)
self.write_to_file(info_movie)
MaoYanSpider.url = 'http://maoyan.com/board/4?offset='


maoyan = MaoYanSpider()
maoyan.main()