1.学习爬虫的基本库

2018-07-09

网络爬虫

1.Python爬虫基本库的使用

1.1 学习使用urllib库

urllib库是Python内置的HTTP请求库(常用的第三方HTTP请求库还有httplib2、requests、treq等)，包含以下4个模块：

request: 最基本的HTTP请求模块，用来模拟发送请求；
error: 异常处理模块；
parse: 一个工具模块，提供许多URL处理方法；
robotparser: 主要用来识别网址的robots.txt文件，判断网站是否可以爬取(其实用的比较少)；

1.1.1 发送请求(request模块)

a. urlopen()

import urllib.request

# Basic use of urlopen()
url = 'https://www.python.org/'

# urlopen() returns an http.client.HTTPResponse object. Using it as a
# context manager closes the underlying connection when the block ends
# instead of leaving it open until garbage collection.
with urllib.request.urlopen(url) as root_htmls:
    print(root_htmls)  # <http.client.HTTPResponse object at 0x02972030>
    print(type(root_htmls))  # <class 'http.client.HTTPResponse'>
    # read() returns the raw body as bytes; decode it so the output is readable
    htmls = root_htmls.read().decode('utf-8')
    # Value of the "Server" response header
    server_name = root_htmls.getheader('Server')
print(htmls)
print(server_name)  # Server header value, e.g. nginx

urlopen()的data参数：

import urllib.request
import urllib.parse

# Sending form data through urlopen()'s "data" argument
url = 'http://httpbin.org/post'
dw = {'word':'hello'}
# urlencode() turns the parameter dict into a query-string style str
dw = urllib.parse.urlencode(dw)
print(dw) # word=hello
# The request body must be bytes, so encode the string first
data1 = bytes(dw, encoding = 'utf8')
# The context manager closes the response once the body has been read,
# instead of leaving the connection open.
with urllib.request.urlopen(url, data=data1) as res:
    print(res.read())

# result:
g:\Python\Demon2\new_book\basic_lib>b2.py
word=hello
b'{"args":{},"data":"","files":{},"form":{"word":"hello"},"headers":{"Accept-Encoding":"identity","Connection":"close","Content-Length":"10","Content-Type":"application/x-www-form-urlencoded","Host":"httpbin.org","User-Agent":"Python-urllib/3.6"},"json":null,"origin":"61.158.149.229","url":"http://httpbin.org/post"}\n'

urlopen()的timeout参数：

import urllib.request
import urllib.error
import socket

# timeout argument: how long to wait for the server before giving up.
# If no response arrives within that time an exception is raised.
urlt = 'http://httpbin.org/get'

try:
    # The context manager closes the response even if read() fails
    with urllib.request.urlopen(urlt, timeout=0.01) as troot_htmls:
        thtmls = troot_htmls.read()
        print(troot_htmls)
except socket.timeout:
    # A timeout while reading the body is raised directly rather than
    # wrapped in URLError, so it needs its own handler.
    print('TIME OUT')
except urllib.error.URLError as e:
    # A timeout while connecting arrives wrapped in URLError.reason
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')
    else:
        # Report other failures instead of silently swallowing them
        print(e.reason)

b. Request

urlopen()只能实现最基本的请求的发起，如果要构建完整的请求，就需要利用更强大的Request类在请求中加入Headers等信息。

# urllib.parse (not the unrelated "parser" module, which no longer exists
# in Python 3.10+) provides urlencode().
import urllib.parse
import urllib.request

URL = 'http://httpbin.org/post'
data_ori = {'name':'Tom'}
data_pu = urllib.parse.urlencode(data_ori)
data = bytes(data_pu, encoding= 'utf8')
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
    'Host': 'httpbin.org'}

# Wrap the URL in a Request object so that url, data, headers and method
# can all be configured in one place.
req = urllib.request.Request(URL, data= data, headers= headers,method= 'POST')
# Headers can also be added after construction. add_header() takes one
# (name, value) pair per call, not a dict:
# req2 = urllib.request.Request(URL, data=data, method='POST')
# for name, value in headers.items():
#     req2.add_header(name, value)
print(req)
# The context manager closes the response once the body has been read
with urllib.request.urlopen(req) as root_htmls:
    htmls = root_htmls.read().decode('utf-8')
print(htmls)

#result：
g:\Python\Demon2\new_book\basic_lib>b4.py
<urllib.request.Request object at 0x01F02AD0>
{"args":{},"data":"","files":{},"form":{"name":"Tom"},"headers":{"Accept-Encoding":"identity","Connection":"close","Content-Length":"8","Content-Type":"application/x-www-form-urlencoded","Host":"httpbin.org","User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0"},"json":null,"origin":"61.158.149.5","url":"http://httpbin.org/post"}

c. 高级用法：Cookies处理、代理设置、登录验证等

urllib.request模块里的BaseHandler类(各种处理器)可以完成这些功能，现列出其常用子类：

HTTPDefaultErrorHandler:处理HTTP响应错误，抛出HTTPError;
HTTPRedirectHandler: 用于处理重定向;
HTTPCookieProcessor: 用于处理Cookies;
ProxyHandler: 用于设置代理，默认代理为空;
HTTPPasswordMgr: 用于管理密码，维护了用户名和密码表(它本身不是BaseHandler的子类，而是配合HTTPBasicAuthHandler等认证处理器使用);
HTTPBasicAuthHandler: 用于管理认证，若一个连接打开时需要认证，则可用其解决认证问题;

打开需要身份验证的网站

from urllib.request import (
    HTTPPasswordMgrWithDefaultRealm,
    HTTPBasicAuthHandler,
    build_opener,
    OpenerDirector,
)
from urllib.error import URLError

# Open a site that requires HTTP basic authentication.
url = 'http://localhost:5000/'
username = 'zhangsan'
pwd = '3'

# The password manager stores the credentials for the URL; None is passed
# as the realm argument.
p = HTTPPasswordMgrWithDefaultRealm()
p.add_password(None, url, username, pwd)

# The auth handler consults the password manager, and build_opener()
# assembles an opener that uses this handler.
auth_handler = HTTPBasicAuthHandler(p)
opener = build_opener(auth_handler)

try:
    # Fetch and print the page source returned after authentication
    result = opener.open(url)
    html = result.read().decode('utf-8')
    print(html)
except URLError as e:
    print(e.reason)

代理设置

from urllib.error import URLError
from urllib.request import ProxyHandler,build_opener

# Sending requests through a proxy
url = 'https://www.baidu.com/'

# Proxy addresses keyed by URL scheme; the proxy here is one running
# locally on port 9743.
ph = dict(
    http='http://127.0.0.1:9743',
    https='https://127.0.0.1:9743',
)
proxy_handler = ProxyHandler(ph)
opener = build_opener(proxy_handler)

try:
    res = opener.open(url)
    htmls = res.read().decode('utf-8')
    print(htmls)
except URLError as e:
    # e.g. the proxy is not running
    print(e.reason)

Cookies处理

import http.cookiejar
import urllib.request

# Working with cookies
url = 'https://www.baidu.com/'

# A CookieJar collects the cookies, HTTPCookieProcessor is the handler
# built around it, and build_opener() turns that handler into an opener
# whose open() performs the request.
cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
res = opener.open(url)

# Print every cookie the jar holds after the request
for item in cookie:
    print(item.name + ' = ' + item.value)

print('===将Cookies以文本形式输出(Mozilla型浏览器格式)===')

# Save the cookies to a text file in the Mozilla browser cookie format.
# MozillaCookieJar is a CookieJar subclass that also handles reading and
# writing cookie files.
filename = 'cookies.txt'
url2 = 'https://www.baidu.com/'
cookie2 = http.cookiejar.MozillaCookieJar(filename)
handler2 = urllib.request.HTTPCookieProcessor(cookie2)
opener2 = urllib.request.build_opener(handler2)
res2 = opener2.open(url2)
cookie2.save(ignore_discard=True, ignore_expires=True)

print('===将Cookies以文本形式输出(LWP格式)===')

# Same again, but saved in the LWP format
filename3 = '3cookies.txt'
url3 = 'https://www.baidu.com/'
cookie3 = http.cookiejar.LWPCookieJar(filename3)
handler3 = urllib.request.HTTPCookieProcessor(cookie3)
opener3 = urllib.request.build_opener(handler3)
res3 = opener3.open(url3)
cookie3.save(ignore_discard=True, ignore_expires=True)

print('===读取本地Cookies文件')

# Load the LWP file written above into a fresh jar and reuse those
# cookies for another request.
cookie_get = http.cookiejar.LWPCookieJar()
cookie_get.load(filename3, ignore_discard=True, ignore_expires=True)
handler4 = urllib.request.HTTPCookieProcessor(cookie_get)
opener4 = urllib.request.build_opener(handler4)
res4 = opener4.open(url3)
res4_r = res4.read().decode('utf-8')
print(res4_r)

1.1.2 处理异常(error模块)

在网络不稳定等情况下，通过异常处理可以防止程序因出现异常而终止运行。

from urllib import request,error
import socket

# Handling request errors
url = 'https://cuiqingcai.com/index.htm'
try:
    res = request.urlopen(url)
    print(res.read().decode('utf-8'))
except error.HTTPError as e:
    # HTTPError is a subclass of URLError, so it has to be caught first.
    # Print reason, status code and response headers, one per line.
    print(e.reason)
    print(e.code)
    print(e.headers)
except error.URLError as e:
    timed_out = isinstance(e.reason, socket.timeout)
    if timed_out:
        print('timeout')
    # The reason is printed for every URLError, timeout or not
    print(e.reason)
else:
    # Runs only when no exception was raised
    print('request successfully')
    
# result:
g:\Python\Demon2\new_book\basic_lib>b8.py
Not Found
404
Server: nginx/1.10.3 (Ubuntu)
Date: Tue, 10 Jul 2018 05:58:21 GMT
Content-Type: text/html; charset=UTF-8
Transfer-Encoding: chunked
Connection: close
Vary: Cookie
Expires: Wed, 11 Jan 1984 05:00:00 GMT
Cache-Control: no-cache, must-revalidate, max-age=0
Link: <https://cuiqingcai.com/wp-json/>; rel="https://api.w.org/"

1.1.3 解析链接(parse模块)

parse定义了处理URL的标准接口。

a. urlparse()

实现url的识别和分段,拆为6个部分:scheme、netloc、path、params、query、fragment

from urllib.parse import urlparse

url = 'https://www.baidu.com/index.html;user?id=5#comment'
# Split the URL into its six components
result = urlparse(url, scheme='http', allow_fragments=True)
print(type(result))  # <class 'urllib.parse.ParseResult'>
print(result)
# ParseResult(scheme='https', netloc='www.baidu.com',
# path='/index.html', params='user', query='id=5', fragment='comment')

# ParseResult is a 6-tuple ordered scheme, netloc, path, params, query,
# fragment, so unpacking it prints each component on its own line.
print(*result, sep='\n')
# Components are reachable by index as well as by attribute name
print(result[0], result[1])  # https www.baidu.com

b. urlunparse()

URL组合：将各部分合并为一个完整的链接。接受的参数是一个可迭代对象(可以被遍历)，且长度必须是6。

from urllib.parse import urlunparse

# The six components urlunparse() expects, in order
url_data = [
    'https',          # scheme
    'www.baidu.com',  # netloc
    '/index.html',    # path
    'user',           # params
    'id=5',           # query
    'comment',        # fragment
]
url = urlunparse(url_data)
print(url)
# https://www.baidu.com/index.html;user?id=5#comment

c. urlsplit()

同urlparse()很像，只是拆解url时，少了params(合并到path中)这一部分

from urllib.parse import urlsplit

url2 = 'https://www.baidu.com/index.html;user?id=5#comment'
# urlsplit() yields five components; params stays inside path
data = urlsplit(url2)
print(data)
# SplitResult(scheme='https', netloc='www.baidu.com',
# path='/index.html;user', query='id=5', fragment='comment')

# Attribute access and index access return the same component
print(data.netloc, data[1])
# www.baidu.com www.baidu.com

d. urlunsplit()

同urlunparse()很像，传入参数也必须是一个可迭代对象，区别是长度必须为5

from urllib.parse import urlunsplit

# The five components urlunsplit() expects, in order
url_data2 = [
    'https',          # scheme
    'www.baidu.com',  # netloc
    'index.html',     # path
    'id=5',           # query
    'comment',        # fragment
]
url3 = urlunsplit(url_data2)
print(url3)
# https://www.baidu.com/index.html?id=5#comment

e. urljoin()

也是将链接合并，接受两个参数：base_url(基础链接)作为第一个参数，新的链接作为第二个参数。该方法会分析base_url的scheme、netloc和path这三个内容，并对新链接缺失的内容进行补充，最后返回结果。

from urllib.parse import urljoin

# Base URLs of different shapes: bare host, with a path, with a query,
# without a scheme, and without a scheme but with a fragment.
base_url1 = 'https://www.baidu.com'
base_url2 = 'https://www.baidu.com/anb.html'
base_url3 = 'https://www.baidu.com?qwae=87'
base_url4 = 'www.baidu.com'
base_url5 = 'www.baidu.com#ojbk'

# URLs to resolve against them: one absolute, one query-and-fragment only
new_url1 = 'https://cuiqingcai.com/FS.htm'
new_url2 = '?category=2#sdf'

u1 = urljoin(base_url1, 'FS.html')  # https://www.baidu.com/FS.html
u2 = urljoin(base_url2, new_url1)   # https://cuiqingcai.com/FS.htm
u3 = urljoin(base_url3, new_url1)   # https://cuiqingcai.com/FS.htm
u4 = urljoin(base_url4, new_url2)   # www.baidu.com?category=2#sdf
u5 = urljoin(base_url5, new_url2)   # www.baidu.com?category=2#sdf

# One joined URL per line, in the order u1..u5
print(u1, u2, u3, u4, u5, sep='\n')

可见，base_url 提供了3项内容：scheme、netloc及path。当这3项在新链接里不存在时，就予以补充；新链接里有的话，就用新链接自己的；base_url中的params、query和fragment是不起作用的。

f. urlencode()

此方法在构造GET请求参数时非常有用,将字典类型的参数转化为GET请求的参数。

from urllib.parse import urlencode

base_url = 'https://www.baidu.com?'

# Serialize the parameter dict into a GET query string
query = {'name':'Jou','age':2}
query = urlencode(query)
print(query)  # name=Jou&age=2

# Append the query string to the base URL
url = ''.join([base_url, query])
print(url)  # https://www.baidu.com?name=Jou&age=2

g. parse_qs()与parse_qsl()

二者的功能与urlencode()相反：
1.parse_qs()将GET请求参数转化为字典类型;
2.parse_qsl()将GET请求参数转化为元组组成的列表;

from urllib.parse import parse_qs, parse_qsl

query2 = 'name=Jou&age=2'

# parse_qs(): query string -> dict mapping each name to a list of values
data = parse_qs(query2)
print(data)  # {'name': ['Jou'], 'age': ['2']}

# parse_qsl(): query string -> list of (name, value) tuples
data2 = parse_qsl(query2)
print(data2)  # [('name', 'Jou'), ('age', '2')]

h.中文字符转URL编码与反转–quote()及unquote()

from urllib.parse import quote, unquote

keyword = '你好'
url = 'https://www.baidu.com?'

# quote() percent-encodes the non-ASCII keyword
qk = quote(keyword)
print(qk)  # %E4%BD%A0%E5%A5%BD
url += qk
print(url)  # https://www.baidu.com?%E4%BD%A0%E5%A5%BD

# unquote() reverses the encoding, for a fragment or for a whole URL
kw = unquote(qk)
url_kw = unquote(url)
print(kw)  # 你好
print(url_kw)  # https://www.baidu.com?你好

1.1.4 Robots协议分析(robotparser模块)

也被称为爬虫协议、机器协议，全名叫网络爬虫排除标准，用来告诉爬虫和搜索引擎哪些页面可以爬取，哪些不可以爬取；
robotparser模块提供了一个类RobotFileParser,它可以根据某网站的robots.txt文件来判断一个爬虫是否有权限来爬取这个网页。

from urllib.robotparser import RobotFileParser

url = 'http://www.jianshu.com/robots.txt'
url_fetch1 = 'http://www.jianshu.com/p/b67554025d7d'
url_fetch='http://www.jianshu.com/search?q=python&page=1&type=collections'
url2 = 'https://www.jianshu.com/'

# Handing the robots.txt URL to the constructor has the same effect as
# creating the parser first and calling set_url() on it.
rb = RobotFileParser(url)
# Download robots.txt and parse it
rb.read()

# Ask whether user agent '*' may fetch each of the three URLs
rbool = rb.can_fetch('*', url2)
r1bool = rb.can_fetch('*', url_fetch1)
r2bool = rb.can_fetch('*', url_fetch)
# One answer per line; all three were False when the notes were written
print(rbool, r1bool, r2bool, sep='\n')

1.2 requests库的使用

1.2.1 基本用法

import requests

url = 'https://www.baidu.com/'
url_post = 'http://httpbin.org/post'
url_put = 'http://httpbin.org/put'
url_delete = 'http://httpbin.org/delete'
url_get = 'http://httpbin.org/get'

# requests has one function per HTTP method
r1 = requests.post(url_post)
r2 = requests.put(url_put)
r3 = requests.delete(url_delete)
r4 = requests.head(url_get)
r5 = requests.options(url_get)

# Inspect a GET response: its type, status code, the type of the body
# text, the body text itself, and the cookies (one item per line)
r = requests.get(url)
print(type(r), r.status_code, type(r.text), r.text, r.cookies, sep='\n')

GET请求

import requests

url = 'http://httpbin.org/get'
data = {'name':'Jim','age':18}
# The second positional parameter of requests.get() is "params", so the
# dict is sent as the URL query string; it is named explicitly here.
r = requests.get(url, params=data)
print(r.text)

# r2 = requests.get(url)
# print(type(r2))
# print(type(r2.text))

# json() parses the response body; print the parsed object and its type
print(r.json())
print(type(r.json()))

知乎问题抓取：

import requests
import re

# Scrape question titles from Zhihu's explore page
url = 'https://www.zhihu.com/explore'
# Adjacent string literals are joined by the parser. A backslash
# continuation inside one literal would embed the next line's
# indentation in the header value, so it is avoided here.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/67.0.3396.99 Safari/537.36'
}
r = requests.get(url, headers=headers)
# re.S lets "." match newlines so the pattern can span several lines of
# HTML; the captured group is the text inside the question link.
pattern = re.compile('explore-feed.*?question_link.*?>(.*?)</a>', re.S)
titles = pattern.findall(r.text)
print(titles)

github图标抓取

import requests
import re

# Download GitHub's favicon
url = 'https://github.com/favicon.ico'
r = requests.get(url)

# r.content holds the body as bytes, hence the binary write mode
icon_bytes = r.content
with open('github.pic', 'wb') as icon_file:
    icon_file.write(icon_bytes)

1.2.2 高级用法

a. 文件上传

import requests

# File upload

url = 'http://httpbin.org/post'

# Open the file in a with-block so the handle is closed after the upload;
# the original left it open for the rest of the program.
with open('github.pic', 'rb') as upload:
    files = {'file': upload}
    r = requests.post(url, files=files)
print(r.text)

b. Cookies

使用requests获取和设置Cookies都很简洁：
1.获取Cookie

import requests

url = 'https://www.baidu.com/'
r = requests.get(url)

# The cookies received with the response
cookies_baidu = r.cookies
print(cookies_baidu)
# <RequestsCookieJar[<Cookie BDORZ=27315 for .baidu.com/>]>

# items() yields (name, value) tuples, which makes the jar easy to
# iterate over.
print(cookies_baidu.items())  # [('BDORZ', '27315')]
for key, value in cookies_baidu.items():
    print(':'.join((key, value)))
# BDORZ:27315

利用Cookies维持登录状态

import requests

# Keep a Zhihu login alive by replaying cookies copied from the browser.
# NOTE(review): this is a session credential hard-coded in source; in real
# code load it from the environment or a file kept out of version control.

url = 'https://www.zhihu.com/'

# One "name=value" entry per cookie. The original built the value with
# backslash continuations inside a single string literal, which embedded
# each continued line's indentation in the middle of the cookie values.
# Adjacent literals in parentheses are joined without any extra
# characters.
cookie_pairs = [
    '_zap=c74ffe60-fc10-4d88-8507-8b2f094ffe47',
    ('z_c0="2|1:0|10:1524489650|4:z_c0|92:Mi4xQU5UbkFnQUFBQUFB'
     'QUtDRUdyNThEU1lBQUFCZ0FsVk5zaXZMV3dBR1VjamtZZ3NWWVZBbjZEUV'
     'JFbElrTGpoOGZn|9ccd196c7209f86d929e45a1390e95ba02dc65699bce56'
     'f85c4725dec94e1365"'),
    'd_c0="ABBh7ahaiA2PTpF7sbd2mFngNGhU8hYDlnk=|1525268870"',
    '_xsrf=783a0476-0ab1-43a0-8d65-c5762532455d',
    'q_c1=14b00bbf304245c79d6eb10cdb7195f6|1530178859000|1524489635000',
    '__utmc=518543923',
    ('__utmv=51854390.100--|2=registration_date=20160419=1'
     '^3=entry_date=20160419=1'),
    'tgw_l7_route=69f52e0ac392bb43ffb22fc18a173ee6',
    '__utma=51854390.1263074800.1531224527.1531276507.1531281798.3',
    '__utmb=51854390.0.10.1531281798',
    ('__utmz=51854390.1531281798.3.3.utmcsr=zhihu.com'
     '|utmccn=(referral)|utmcmd=referral|utmcct=/'),
]

headers = {
    # The HTTP request header is named "Cookie"; the original used
    # "Cookies", which is a different, non-standard header name.
    'Cookie': '; '.join(cookie_pairs),
    'Host': 'www.zhihu.com',
    'User-Agent': ('Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/67.0.3396.99 Safari/537.36'),
}
r = requests.get(url, headers=headers)
print(r.text)

c. 会话维持(Session)

Session模拟在一个浏览器中打开同一个站点的不同页面，模拟登录成功后接下来的一些操作。

import requests

# Keeping state across requests with a Session
s = requests.Session()
url_set = 'http://httpbin.org/cookies/set/number/1234'
url_get = 'http://httpbin.org/cookies'

# Requesting this URL sets a cookie named "number" with the value 1234
rs = s.get(url_set)
print(rs.text)

# Requesting this URL through the same session shows the current cookies
r = s.get(url_get)
print(r.text)

d. SSL证书验证

…

爬取猫眼电影TOP100

""" 
爬取猫眼电影TOP100 
电影名称、时间、评分、图片 等信息
"""
import requests
import re
import json
import time

class MaoYanSpider():
    """Scrape the Maoyan TOP100 board and append each page to a text file.

    Every page is downloaded with ``requests``, its movies are extracted
    with the regular expressions below, and the resulting list of dicts is
    written as one JSON line to ``movie_rank_info.txt``.
    """

    # Board URL; a page offset (0, 10, ..., 90) is appended to it
    url = 'http://maoyan.com/board/4?offset='

    # Each pattern is anchored at a movie's <dd> tag and captures one field
    title_pattern = '<dd>.*?name.*?a.*?>(.*?)</a>'
    rank_pattern = '<dd>.*?board-index.*?>(.*?)</i>'
    pic_pattern = '<dd>.*?data-src="(.*?)@160w'
    actor_pattern = '<dd>.*?star.*?>(.*?)</p>'
    time_pattern = '<dd>.*?releasetime">(.*?)</p>'
    # Two groups, which are concatenated to form the score
    score_pattern = '<dd>.*?score.*?i.*?>(.*?)</i>.*?i.*?>(.*?)</i>'

    def get_one_page(self, offset=''):
        """Download one board page.

        Args:
            offset: Value appended to ``MaoYanSpider.url``. The default
                empty string requests ``MaoYanSpider.url`` unchanged, as
                the original zero-argument method did.

        Returns:
            The page HTML as text, or None when the status code is not 200.
        """
        # Adjacent literals are joined by the parser. The original used a
        # backslash continuation inside the string, which put a run of
        # spaces between "AppleWebKit" and "/537.36".
        headers = {
            'User-Agent': ('Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                           'AppleWebKit/537.36 (KHTML, like Gecko) '
                           'Chrome/67.0.3396.99 Safari/537.36')
        }
        response = requests.get(MaoYanSpider.url + str(offset), headers=headers)
        if response.status_code == 200:
            return response.text
        return None

    @staticmethod
    def _extract(pattern, chunk):
        """Return the groups captured by *pattern* in *chunk*, joined, or ''."""
        found = re.search(pattern, chunk, re.S)
        return ''.join(found.groups()) if found else ''

    def refine(self, root_htmls):
        """Extract the movie records from one page of HTML.

        The page is cut into one chunk per ``<dd>`` tag and every field is
        searched inside its own chunk. A movie that lacks a field therefore
        gets an empty string for it; matching each pattern over the whole
        page would instead shift the later movies' values or raise
        IndexError.

        Args:
            root_htmls: Page HTML, or None/empty when the download failed.

        Returns:
            A list of dicts with the keys Title, Rank, Actor, Time, Score
            and Img, one per ``<dd>`` block; empty when there is no input.
        """
        if not root_htmls:
            return []

        info_movie = []
        # The text before the first <dd> holds no movie, hence [1:]
        for tail in root_htmls.split('<dd>')[1:]:
            # Restore the tag removed by split(); every pattern starts with it
            chunk = '<dd>' + tail
            info_movie.append({
                'Title': self._extract(MaoYanSpider.title_pattern, chunk),
                'Rank': self._extract(MaoYanSpider.rank_pattern, chunk),
                # Only the actor text is stripped of surrounding whitespace
                'Actor': self._extract(MaoYanSpider.actor_pattern, chunk).strip(),
                'Time': self._extract(MaoYanSpider.time_pattern, chunk),
                'Score': self._extract(MaoYanSpider.score_pattern, chunk),
                'Img': self._extract(MaoYanSpider.pic_pattern, chunk),
            })

        return info_movie

    def write_to_file(self, info_movie):
        """Append *info_movie* to movie_rank_info.txt as one JSON line."""
        with open('movie_rank_info.txt', 'a', encoding='utf-8') as f:
            # ensure_ascii=False keeps non-ASCII text readable in the file
            contents = json.dumps(info_movie, ensure_ascii=False)
            f.write(contents + '\n')

    def main(self):
        """Crawl the ten board pages (offsets 0-90) and store each one."""
        for offset in range(0, 100, 10):
            # Pause one second before every request
            time.sleep(1)
            print(MaoYanSpider.url + str(offset))
            root_htmls = self.get_one_page(offset)
            if root_htmls is None:
                # A non-200 response used to crash refine(); skip the page
                print('skipped: no page returned for offset ' + str(offset))
                continue
            self.write_to_file(self.refine(root_htmls))


maoyan = MaoYanSpider()
# The entry-point guard belongs at module level: importing this file does
# not start a crawl, and main() works whenever it is called.
if __name__ == '__main__':
    maoyan.main()