5-Selenium与Splash的使用

5. Selenium与Splash的使用

#### 5.1 Selenium的使用

  1. 查找节点、节点交互操作
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
from selenium import webdriver
import os
import time

abspath = os.path.abspath(r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe")
# Initialize the browser object and assign it to `browser`
browser = webdriver.Chrome(executable_path=abspath)

url = 'https://www.taobao.com/'
# Drive the browser: load the page
browser.get(url)
# Grab the rendered page source
root_htmls = browser.page_source
#print(root_htmls)
#browser.close()

# Find a single node
# Get the search input box (id="q")
input_first = browser.find_element_by_id('q')
# input_sec = browser.find_element_by_css_selector('#q')
# input_third = browser.find_element_by_xpath('//*[@id="q"]')

# Find multiple nodes
navs = browser.find_elements_by_css_selector('.nav-hd li')
print(navs)

# Node interaction
# Type text into the box
input_first.send_keys('iPhone')
time.sleep(3) # wait 3 seconds
input_first.clear()# clear the input box
input_first.send_keys('泳镜')# type different text
# Get the search button
but = browser.find_element_by_class_name('btn-search')
but.click() # click to perform the search
time.sleep(8) # sleep 8 seconds
browser.close()# close the browser
  1. 动作链
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
from selenium import webdriver
import os
from selenium.webdriver import ActionChains

abspath = os.path.abspath(r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe")
browser = webdriver.Chrome(executable_path=abspath)

# Action chains
# Demonstrate dragging one node onto another
url = 'http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable'
browser.get(url)

# Switch into the iframe (effectively a sub-page embedded in the page)
browser.switch_to.frame('iframeResult')

# Select the node to drag and the drop target, in that order
source = browser.find_element_by_css_selector('#draggable')
target = browser.find_element_by_css_selector('#droppable')

actions = ActionChains(browser)
actions.drag_and_drop(source, target)
actions.perform()
  1. 获取节点信息
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
from selenium import webdriver
import os


# 1. Some operations have no Selenium API of their own;
#    they can be done by executing JavaScript directly.
abspath = os.path.abspath(r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe")
browser = webdriver.Chrome(executable_path=abspath)

url = 'https://www.zhihu.com/explore'
browser.get(url)

browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
browser.execute_script('alert("To Bottom")')


# 2. Getting node information
# Get an attribute
logo = browser.find_element_by_id('zh-top-link-logo')
print(logo.text)
#print(logo.get_attribute('class'))

# Get text content
# First grab the "ask a question" button node
input1 = browser.find_element_by_class_name('zu-top-add-question')
print(input1.text)

# Get id, location, tag name and size
print(input1.id)# internal node id
print(input1.location)# node position relative to the page
print(input1.tag_name)# tag name
print(input1.size)# node size
  1. 切换frame
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
import time
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
import os

# Switching frames.
# An iframe is a page node that acts as a child frame — effectively a
# page embedded inside a page. Use switch_to.frame(...) to enter an
# iframe, and switch_to.parent_frame() to go back up to its parent.
abspath = os.path.abspath(r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe")
browser = webdriver.Chrome(executable_path=abspath)
url = 'http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable'
browser.get(url)
browser.switch_to.frame('iframeResult')
try:
    # Inside the iframe there is no element with class "logo"
    logo = browser.find_element_by_class_name('logo')
    print('iframe logo')
except NoSuchElementException:
    print('No iframe\'s class is logo')

browser.switch_to.parent_frame()
try:
    # class="navbar-header logo" — selecting by 'logo' works here too
    logo = browser.find_element_by_class_name('navbar-header')
    print(logo.text)
except NoSuchElementException:
    print('no no no')
  1. 延时等待
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import os

# Waits
# Wait a while so any extra Ajax requests finish and all nodes are
# fully loaded before reading page_source.

abspath = os.path.abspath(r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe")
browser = webdriver.Chrome(executable_path=abspath)
url = 'https://www.zhihu.com/explore'

# 1. Implicit wait
browser.implicitly_wait(10)
browser.get(url)
inp = browser.find_element_by_class_name('zu-top-add-question')
print(inp.text)
# Page load time depends on network conditions, but the implicit wait
# is a single fixed timeout — rigid and not ideal.


print('==显示等待==')
# 2. Explicit wait
# Give a maximum wait time; return as soon as the condition holds,
# raise an exception if the timeout is exceeded.
br2 = webdriver.Chrome(executable_path=abspath)
br2.get(url)

wait = WebDriverWait(br2, 10)
# Within 10 seconds, wait for the node with id "q" (the search box);
# raises if it never appears.
inp2 = wait.until(EC.presence_of_element_located((By.ID, 'q')))
# Wait until the element matching this CSS selector is clickable
but = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.zu-top-search-button')))
print(inp2, but)
  1. 前进和后退、Cookies操作及选项卡操作
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import time,os
from selenium import webdriver
abspath = os.path.abspath(r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe")
br1 = webdriver.Chrome(executable_path=abspath)

# 1. Back and forward navigation
u1 = 'https://www.baidu.com/'
u2 = 'https://taobao.com/'
u3 = 'https://douban.com/'
br1.get(u1)
br1.get(u2)
br1.get(u3)
br1.back()
time.sleep(2)
br1.back()
time.sleep(1)
br1.forward()


# 2. Cookie operations
u4 = 'https://www.zhihu.com/explore'
br2 = webdriver.Chrome(executable_path=abspath)
br2.get(u4)
print(br2.get_cookies())

# NOTE(review): this cookie never shows up in get_cookies(); presumably
# the browser rejects it because 'dog' does not match the current page's
# domain — verify against the WebDriver add_cookie rules.
br2.add_cookie({'name':'cat','domain':'dog','value':'why'})
cookies = br2.get_cookies()
for cookie in cookies:
    print(cookie)
print(br2.get_cookies())
br2.delete_all_cookies()
print(br2.get_cookies())

# 3. Tab management

brs1 = webdriver.Chrome(executable_path=abspath)
url1 = 'http://www.baidu.com'
url2 = 'https://taobao.com/'
url3 = 'https://douban.com/'
brs1.get(url1)

brs1.execute_script('window.open()')  # open a new tab via JavaScript
print(brs1.window_handles)
# switch_to.window replaces the deprecated switch_to_window()
brs1.switch_to.window(brs1.window_handles[1])
brs1.get(url2)
time.sleep(3)
brs1.switch_to.window(brs1.window_handles[0])
brs1.get(url3)


# 4. Exception handling
# The usual pattern:
""" try:
pass
except expression as identifier:
pass """

爬取某猫商品信息

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import os
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from urllib.parse import quote
from pyquery import PyQuery as pq
from pymongo import MongoClient
import time

abspath = os.path.abspath(r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe")
# Initialize the browser object and assign it to `browser`
browser = webdriver.Chrome(executable_path=abspath)
# Explicit wait, 10-second maximum
wait = WebDriverWait(browser,10)
KEYWORD = 'iPad' # the search keyword

def index_page(page):
    """
    Scrape one search-result index page, then hand it to get_products().

    :param page: 1-based page number to load
    """
    print('正在爬取第',page,'页')
    url_base = 'https://s.taobao.com/search?q='
    url = url_base + quote(KEYWORD)
    print(url)
    try:
        browser.get(url)
        if page > 1:
            # the page-number input box (renamed from `input`, which
            # shadowed the builtin)
            page_input = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, '#mainsrp-pager div.form > input')))

            # the "go to page" submit button
            submit = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, '#mainsrp-pager div.form > span.btn.J_Submit')))
            page_input.clear()
            page_input.send_keys(page)
            submit.click()

        # Confirm we landed on the requested page by checking the
        # highlighted page-number node's text.
        # Arguments: locator, expected text
        wait.until(EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, '#mainsrp-pager li.item.active > span'), str(page)))

        # Make sure every product item block has loaded
        wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, '.m-itemlist .items .item')))

        get_products()

    except TimeoutException:
        # Bug fix: Selenium waits raise TimeoutException (imported at the
        # top of the file), not the builtin TimeoutError — the original
        # retry branch could never fire. Retry the same page on timeout.
        index_page(page)


def get_products():
    """
    Parse the current result page with pyquery, extract each product's
    fields, and pass every record to save_to_mongo().
    """
    htmls = browser.page_source
    #print(htmls)
    doc = pq(htmls)
    items = doc('#mainsrp-itemlist .items .item').items()
    for item in items:
        product = {
            'title': item.find('.title').text(),
            'price': item.find('.price').text(),
            'deal': item.find('.deal-cnt').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text(),
            # lazy-loaded images keep the real URL in data-src
            'image': item.find('.pic .img').attr('data-src'),
        }
        print(product)
        save_to_mongo(product)


def save_to_mongo(result):
    """
    Save one product record to MongoDB.

    :param result: dict holding one product's fields
    """
    # NOTE(review): opening a new MongoClient per record is wasteful;
    # consider hoisting the client to module level.
    client = MongoClient(host='localhost', port=27017)
    db = client.admin
    collection = db.taobao
    try:
        # insert_one replaces the deprecated Collection.insert(); the
        # original per-call counter (always reset to 1) was dropped.
        if collection.insert_one(result):
            print('存储成功')
    except Exception:
        print('存储失败...')


def main():
    """
    Entry point: crawl every result page in order, then close the browser.
    """
    MAX_PAGE = 100
    for i in range(1,MAX_PAGE + 1):
        index_page(i)
    # close the browser only after all pages are done
    browser.close()

if __name__ == '__main__':
    main()
分享到