1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107
| import os from selenium import webdriver from selenium.common.exceptions import TimeoutException from selenium.webdriver.common.by import By from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.wait import WebDriverWait from urllib.parse import quote from pyquery import PyQuery as pq from pymongo import MongoClient import time
# Location of the ChromeDriver executable handed to Selenium.
abspath = os.path.abspath(
    r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"
)

# One shared Chrome session for the whole script, plus an explicit-wait
# helper bound to it (timeout argument: 10).
browser = webdriver.Chrome(executable_path=abspath)
wait = WebDriverWait(browser, 10)

# Search term that is URL-quoted into the Taobao search query.
KEYWORD = 'iPad'
def index_page(page):
    """Load one page of the search results and scrape its products.

    Opens the Taobao search URL for ``KEYWORD``. For any page after the
    first, the page number is typed into the pager's input box and the
    pager's submit button is clicked. The function then waits until the
    pager's active item shows ``page`` and until at least one product item
    is present, and finally calls ``get_products()``.

    If any of the waits times out, the same page is attempted again by
    calling this function recursively.

    :param page: page number to fetch (``main`` passes 1..MAX_PAGE)
    """
    print('正在爬取第', page, '页')
    url_base = 'https://s.taobao.com/search?q='
    url = url_base + quote(KEYWORD)
    print(url)
    try:
        browser.get(url)
        if page > 1:
            # Jump to the requested page through the pager's "go to page" form.
            page_input = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, '#mainsrp-pager div.form > input')))
            submit = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR,
                 '#mainsrp-pager div.form > span.btn.J_Submit')))
            page_input.clear()
            page_input.send_keys(page)
            submit.click()
        # The highlighted pager entry must show the requested page number
        # before the listing is trusted to belong to that page.
        wait.until(EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, '#mainsrp-pager li.item.active > span'),
            str(page)))
        wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, '.m-itemlist .items .item')))
        get_products()
    except TimeoutException:
        # WebDriverWait.until raises selenium's TimeoutException, which is
        # unrelated to the builtin TimeoutError; catching the builtin would
        # never trigger this retry.
        index_page(page)
def get_products():
    """Extract product data from the page currently loaded in the browser.

    Parses the browser's page source with PyQuery, walks every element
    matching ``#mainsrp-itemlist .items .item``, builds a dict per product
    (title, price, deal, shop, location, image), prints it and passes it to
    ``save_to_mongo``.
    """
    # (result key, CSS selector) pairs whose element text is stored as-is.
    text_fields = (
        ('title', '.title'),
        ('price', '.price'),
        ('deal', '.deal-cnt'),
        ('shop', '.shop'),
        ('location', '.location'),
    )
    document = pq(browser.page_source)
    for node in document('#mainsrp-itemlist .items .item').items():
        product = {
            key: node.find(selector).text() for key, selector in text_fields
        }
        # The image value is read from the ``data-src`` attribute instead of
        # the element text.
        product['image'] = node.find('.pic .img').attr('data-src')
        print(product)
        save_to_mongo(product)
def save_to_mongo(result):
    """Save one product record to the MongoDB database.

    Connects to MongoDB on ``localhost:27017`` and inserts ``result`` into
    the ``taobao`` collection of the ``admin`` database. Success and failure
    are reported with ``print``; failures are not re-raised, so one bad
    record does not stop the crawl.

    :param result: dict holding the data of a single product
    """
    client = MongoClient(host='localhost', port=27017)
    try:
        # insert_one replaces Collection.insert, which is deprecated since
        # PyMongo 3.0 and no longer exists in PyMongo 4.
        client.admin.taobao.insert_one(result)
    except Exception as exc:
        # Best-effort storage: report the cause and carry on.
        print('存储失败...', exc)
    else:
        # The running total lives on the function object so it survives
        # between calls; a local counter would restart at 1 every time.
        save_to_mongo.saved_count = getattr(save_to_mongo, 'saved_count', 0) + 1
        print('存储成功' + ': ' + str(save_to_mongo.saved_count))
    finally:
        # A client is created per call, so release its connections here.
        client.close()
def main():
    """Entry point: crawl every results page, then shut the browser down.

    Calls ``index_page`` for pages 1 through ``MAX_PAGE`` inclusive. The
    browser session is ended in a ``finally`` clause so that it is released
    even when the crawl stops early because of an exception.
    """
    MAX_PAGE = 100
    try:
        for i in range(1, MAX_PAGE + 1):
            index_page(i)
    finally:
        # quit() ends the whole WebDriver session (all windows and the
        # driver process); close() would only close the current window.
        browser.quit()


if __name__ == '__main__':
    main()
|