11. Hands-On: A Native Crawler

11.1 Crawler Preliminaries

a. Be clear about the goal.
b. Find the web page that holds the data.
c. Analyze the page structure to locate the tags containing the data.
d. Simulate an HTTP request, send it to the server, and receive the HTML it returns (the response is the complete HTML of the page); then extract the data you need with a regular expression. A minimal sketch follows.
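
Step d plus the regex extraction fits in a handful of lines. This is only a sketch: the URL (example.com) and the `<title>` pattern are placeholders for illustration, not part of the course example:

from urllib import request
import re

# d. send the HTTP request and read back the complete HTML
html = request.urlopen('https://example.com').read().decode('utf-8')

# then pull out the data you need with a regular expression (placeholder pattern)
print(re.findall(r'<title>([\s\S]*?)</title>', html))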

11.2 Crawling and Breakpoint Debugging

a. Set your breakpoints, then: F5 to run, F10 to step over line by line, F5 again to jump from one breakpoint to the next, F11 to step into an object or function.
b. Why it helps: hovering over a variable shows its current state, so you can inspect its attributes and values.

##### Crawler v1:

from urllib import request
import re

class Spider():
    url = 'https://www.panda.tv/cate/lol?pdt=1.24.s1.3.1loapcheq15'
    # Match everything in between, non-greedily
    root_pattern = r'<div class="video-info">[\s\S]*?</div>'

    # A leading double underscore makes the method private
    def __fetch_content(self):
        # Fetch the raw page data
        r = request.urlopen(Spider.url)
        htmls = r.read()
        # Decode the returned bytes into a str
        htmls = str(htmls, encoding='utf-8')
        return htmls

    # Process the raw HTML
    def __analysis(self, htmls):
        # infos: the list of per-streamer HTML fragments
        infos = re.findall(Spider.root_pattern, htmls)
        name_pattern = r'</i>[\w\W]*?</span>'
        number_pattern = r'<span class="video-number">(.*)</span>'

        for names in infos:
            # Extract the viewer count from its capture group
            number_groups = re.search(number_pattern, names)
            number = number_groups.group(1)

            # Extract the name, then strip newlines and spaces
            name_pre = re.findall(name_pattern, names)
            name_no_space = name_pre[0].replace('\n', '')
            name_space = re.search('</i>(.*)</span>', name_no_space)
            name = name_space.group(1).replace(' ', '')
            print('Streamer: ' + name + ', popularity: ' + number)

    def go(self):
        htmls = self.__fetch_content()
        self.__analysis(htmls)

spider = Spider()
spider.go()
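
The `*?` in root_pattern is what keeps each match confined to a single streamer's block. A quick demonstration of greedy vs. non-greedy matching, on a made-up two-entry snippet:

import re

html = '<div class="video-info">A</div><div class="video-info">B</div>'

# Greedy: [\s\S]* runs on to the LAST </div>, swallowing both entries in one match
print(re.findall(r'<div class="video-info">[\s\S]*</div>', html))
# ['<div class="video-info">A</div><div class="video-info">B</div>']

# Non-greedy: [\s\S]*? stops at the FIRST </div>, giving one match per entry
print(re.findall(r'<div class="video-info">[\s\S]*?</div>', html))
# ['<div class="video-info">A</div>', '<div class="video-info">B</div>']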

##### Crawler v2:
from urllib import request
import re

class Spider():
    url = 'https://www.panda.tv/cate/lol?pdt=1.24.s1.3.1loapcheq15'

    # Non-greedy match; the parentheses capture only the inner content,
    # discarding the surrounding tags we do not need
    root_pattern = r'<div class="video-info">([\s\S]*?)</div>'
    name_pattern = r'</i>([\s\S]*?)</span>'
    number_pattern = r'<span class="video-number">([\w\W]*?)</span>'

    # A leading double underscore makes the method private
    def __fetch_content(self):
        # Fetch the raw page data
        r = request.urlopen(Spider.url)
        htmls = r.read()
        # Decode the returned bytes into a str
        htmls = str(htmls, encoding='utf-8')
        return htmls

    # Process the raw HTML
    def __analysis(self, htmls):
        # root_htmls: the list of per-streamer HTML fragments
        root_htmls = re.findall(Spider.root_pattern, htmls)
        name_list = []
        number_list = []

        for each_html in root_htmls:
            # findall returns a list such as ['\n  七堇年华小七  ', '\n  ']
            name_ori = re.findall(Spider.name_pattern, each_html)
            # Take the str holding the name out of the list
            name_ori = name_ori[0]
            # Remove the spaces; note the leading '\n' still survives here (v3 fixes that)
            name = name_ori.replace(' ', '')
            number = re.findall(Spider.number_pattern, each_html)
            # Take the str holding the count out of the one-element list
            number = number[0]
            # Append to name_list and number_list in matching order
            name_list.append(name)
            number_list.append(number)

        # Print the results
        for index in range(0, len(name_list)):
            print(name_list[index] + ': ' + number_list[index])

    def go(self):
        htmls = self.__fetch_content()
        self.__analysis(htmls)

spider = Spider()
spider.go()
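
The key change from v1 is the parentheses in the patterns: when a pattern contains one capture group, re.findall returns the group's content instead of the whole match, so the surrounding tags never need stripping afterwards. On a sample fragment:

import re

fragment = '<span class="video-number">7.3万</span>'

# Without a group, findall returns the full match, tags included
print(re.findall(r'<span class="video-number">[\w\W]*?</span>', fragment))
# ['<span class="video-number">7.3万</span>']

# With a group, findall returns only the captured inner text
print(re.findall(r'<span class="video-number">([\w\W]*?)</span>', fragment))
# ['7.3万']
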
##### Crawler v3:
from urllib import request
import re

class Spider():
    url = 'https://www.panda.tv/cate/lol?pdt=1.24.s1.3.1loapcheq15'

    # Non-greedy match; the parentheses capture only the inner content
    root_pattern = r'<div class="video-info">([\s\S]*?)</div>'
    name_pattern = r'</i>([\s\S]*?)</span>'
    number_pattern = r'<span class="video-number">([\w\W]*?)</span>'

    # A leading double underscore makes the method private
    def __fetch_content(self):
        # Fetch the raw page data
        r = request.urlopen(Spider.url)
        htmls = r.read()
        # Decode the returned bytes into a str
        htmls = str(htmls, encoding='utf-8')
        return htmls

    # Process the raw HTML
    def __analysis(self, htmls):
        root_htmls = re.findall(Spider.root_pattern, htmls)
        name_number_list = []

        for each_html in root_htmls:
            # findall returns a list such as ['\n  七堇年华小七  ', '\n  ']
            name_ori = re.findall(Spider.name_pattern, each_html)
            name_ori = name_ori[0]
            # Remove the spaces to get the bare name...
            name = name_ori.replace(' ', '')
            # ...and the leading '\n' (e.g. '\nSJY潇洒')
            name = name.replace('\n', '')
            number = re.findall(Spider.number_pattern, each_html)
            number = number[0]
            # Store each pair in a dict; mind the key: value format
            name_number_dict = {'name': name, 'number': number}
            # Append the dict to the list
            name_number_list.append(name_number_dict)

        return name_number_list

    def __refine(self, name_number_list):
        for x in name_number_list:
            print(x)

    def go(self):
        htmls = self.__fetch_content()
        name_number_list = self.__analysis(htmls)
        self.__refine(name_number_list)

spider = Spider()
spider.go()
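
A side note on the cleanup: the two chained replace() calls delete every space and newline, including any inside the name, while str.strip() trims only the ends, which is usually what you want. The final version below uses strip(). On a sample value:

raw = '\n    七堇年华小七    '

print(raw.replace(' ', '').replace('\n', ''))  # 七堇年华小七
print(raw.strip())  # 七堇年华小七 (same result here, but inner spaces would survive)
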
##### The final crawler (with more standard formatting):

1. Prefer flat, same-level functions; they keep the structure clear.
Recommendations for comments:
a. '''multi-line docstrings''' go inside the function, indented with it;
b. single-line comments are best placed on the line above the code they describe, which is easier to read;
c. make good use of blank lines to aid reading;
d. aim to keep functions at roughly 10-20 lines; small functions are easier to reuse and to read.
2. For medium and large crawlers, consider:
a. the BeautifulSoup library, whose convenience methods help extract content quickly (a hedged sketch appears at the end of this section);
b. the Scrapy crawler framework.
Note: learn a framework only when your actual needs call for it!
3. To find a function inside a large block of code: Ctrl+Shift+O.

""" 
爬虫模块
"""
from urllib import request
import re

class Spider():
"""
爬虫类
"""
url = 'https://www.panda.tv/cate/lol?pdt=1.24.s1.3.1loapcheq15'

# 设置中间 匹配 所有 且非贪婪 加() 去掉外层 不需要的数据
root_pattern = '<div class="video-info">([\s\S]*?)</div>'
name_pattern = '</i>([\s\S]*?)</span>'
number_pattern = '<span class="video-number">([\w\W]*?)</span>'

def __fetch_content(self):
""" 抓取基本内容 __私有方法"""

# 爬取网页的初始数据
r = request.urlopen(Spider.url)
htmls = r.read()
htmls = str(htmls, encoding='utf-8') # 得到的数据进行转码

return htmls

def __analysis(self,htmls):
""" 对初始数据进行处理 """

#得到所有names的数据列表:infos
root_htmls = re.findall(Spider.root_pattern,htmls)
name_number_list = []

for each_html in root_htmls:
#得到name的一个列表
# ['\n 七堇年华小七 ', '\n
# ']
name = re.findall(Spider.name_pattern, each_html)
number = re.findall(Spider.number_pattern, each_html)

# 添加进一个dict里,要注意字典的格式 key:value
name_number_dict = {'name':name, 'number':number}

#添加进;list 里
name_number_list.append(name_number_dict)

return name_number_list

def __refine(self,name_number_list):
""" 进一步处理数据 """

# 将原始的name--number dict 的数据再处理一下:
# {'name': ['\n LOL丶摇摆哥 ', '\n
# '], 'number': ['7.3万']}
l = lambda each_name_number: {'name':each_name_number['name'][0].strip(),
'number':each_name_number['number'][0]}
return map(l, name_number_list)

def __sort(self,name_numbers):
""" 排序 """
name_numbers = sorted(name_numbers, key=self.__sort__seed, reverse = True)
return name_numbers

def __sort__seed(self, name_number):
""" 排序准备 """
nu = re.findall('\d*',name_number['number'])
number = float(nu[0])
if '万' in name_number['number']:
number = number * 10000

return number

def __show(self,name_numbers):
""" 展示数据 """

#count = 1
for rank in range(0,len(name_numbers)):
print('Rank: '+str(rank+1)+'--'+
name_numbers[rank]['name']+'----'+
name_numbers[rank]['number'])

# for name_number in name_numbers:
# print('Rank '+str(count) + '--'+ name_number['name']+'----'+name_number['number'])
# count += 1

def go(self):
""" 入口函数 """

htmls = self.__fetch_content()
name_number_list = self.__analysis(htmls)

# 将refine返回的map对象强制转换为list
name_numbers = list(self.__refine(name_number_list))
name_numbers = self.__sort(name_numbers)
self.__show(name_numbers)

spider = Spider()
spider.go()
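
The sort key turns strings such as '7.3万' into comparable floats (万 = 10,000). A standalone check of the same conversion, on made-up sample values:

import re

def sort_seed(number_str):
    # Same conversion as Spider.__sort__seed
    number = float(re.findall(r'\d+\.?\d*', number_str)[0])
    if '万' in number_str:
        number *= 10000
    return number

data = ['7.3万', '980', '1.2万']  # made-up sample values
print(sorted(data, key=sort_seed, reverse=True))
# ['7.3万', '1.2万', '980']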

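Finally, the BeautifulSoup sketch promised in the notes above. This is a sketch only: it assumes beautifulsoup4 is installed (pip install beautifulsoup4) and that the page still serves the same video-info / video-number markup; panda.tv has since gone offline, so treat the URL as historical. The name span has no class in the snippets above, so only the viewer count is shown here:

from urllib import request
from bs4 import BeautifulSoup  # pip install beautifulsoup4

url = 'https://www.panda.tv/cate/lol?pdt=1.24.s1.3.1loapcheq15'
html = request.urlopen(url).read().decode('utf-8')
soup = BeautifulSoup(html, 'html.parser')

# One <div class="video-info"> block per streamer, as in the regex versions
for info in soup.find_all('div', class_='video-info'):
    number = info.find('span', class_='video-number')
    if number is not None:
        # get_text(strip=True) replaces the manual replace()/strip() cleanup
        print(number.get_text(strip=True))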