3. 数据存储

2018-07-12

网络爬虫

3. 数据存储

3.1 文件存储

3.1.1 TXT文本存储

以爬取知乎–发现–页面-今日最热-部分为例，爬取标题、回答者、答案，并保存到文本中。

""" 
爬取知乎今日最热问答 
"""
import requests
from urllib import error
import socket
from pyquery import PyQuery as pq
import  datetime

class ZhiHuEx():
    """ 
    知乎今日最热爬虫类 
    """
    url = 'https://www.zhihu.com/explore'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) \
        AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }

    def get_contents(self):
        """ 
        获取网页源码内容并实例化pyquery对象
         """
        try:
            rep = requests.get(ZhiHuEx.url, headers=ZhiHuEx.headers)
            htmls = rep.text
            doc = pq(htmls)
            return doc
        except error.HTTPError as e:
            print(e.reason)
        except error.URLError as e:
            if isinstance(e.reason, socket.timeout):
                print('time out')
            else:
                print(e.reason)

    def analy_data(self, doc):
        """ 
        获取question、author、answer内容并返回 
        """
        ##PyQuery对象,调用items()方法可得到一个生成器
        items = doc('.explore-feed.feed-item')
        for item in items.items():
            question = item('.question_link').text()
            author = item.find('.author-link-line').text()

            #由于答案部分的html代码,标签与文字混杂在一起,
            # 故先调用html()方法,将其标签格式等规范化，
            # 再构建pq对象，直接调用text()方法
            answer = pq(item.find('.content').html()).text()

            #调试用
            #an = pq(item.find('.content').html())
            #print(an, end='==========\n')
            #print(type(an))

            return question,author,answer

    def file_save(self,question,author,answer):

        """  
        将数据按照指定格式一条一条保存到本地txt 
        """
        i = datetime.datetime.now()
        number = (str(i.year)+'-'+str(i.month)+'-'+
            str(i.day)+'-'+str(i.hour)+'-'+str(i.minute)+'-'+str(i.second))
        file = open(number+'zhihu-explore.txt','a',encoding='utf-8')
        file.write('\n'.join([question, author, answer]))
        file.write('\n' + '='*50 + '\n')
        file.close()

        with open(number+'z-h-e.txt','a',encoding='utf-8') as file:
            file.write('\n'.join([question, author, answer]))
            file.write('\n' + '='*50 + '\n')

    
    def go(self):
        """ 入口函数 
        """
        doc = self.get_contents()
        question,author,answer = self.analy_data(doc)
        self.file_save(question,author,answer)

zhe = ZhiHuEx()
zhe.go()

3.1.2 Json文件存储日常使用操作

import json

# 将Json文本转换为Json对象：loads()

#json文本字符串需要 双引号 来表示，
# 否则loads会报错
str = '''[{
    "name":"hi",
    "age":12
    },{
    "name":"bg",
    "age":18
    }]'''
data = json.loads(str)#转换为json对象
print(data)
#[{'name': 'hi', 'age': 12}, {'name': 'bg', 'age': 18}]

#对json对象取值
print(data[0].get('name')) # hi
print(data[0].get('sex')) #None
print(data[0].get('sex','man')) #man
print(data[0]['name']) # hi

# 输出json(需调用dumps()方法)
with open('data.json','w') as file:
    #将json对象data转换为str，写入文本
    file.write(json.dumps(data))


#读取json

with open('data.json','r') as file:
    sa = file.read() #读到的是json字符串str
    ss = json.loads(sa)
    print(ss[1]['name']) #bg

#关于中文转码
#如何保存Json缩进空格格式

data2 = [{
    'name':'猴哥',
    'age':1000
}]
#参数indent代表缩进字符个数
with open('d2.json','w',encoding='utf-8') as file:
    file.write(json.dumps(data2, indent=2, ensure_ascii=False))

有格式的保存Json字符串到本地：

直接保存Json字符串到本地：

3.1.3 CSV文件存储

详见代码：

#自己写的py文件名，
# 不要和python模块名字相同！！！！
import csv

with open('text1.csv','a') as csvfile:
    #调用writer()方法,初始化写入对象，
    # 并传入文件句柄csvfile
    wr = csv.writer(csvfile)
    wr.writerow(['id','name','age'])
    wr.writerow(['1','Tom',12])

#写入的文本默认以逗号分隔，
# 若想修改列与列之间的分隔符，可传入delimiter参数
with open('t1.csv','w') as csvfile:
    wr2 = csv.writer(csvfile,delimiter=' ')
    wr2.writerow(['id','name','age'])
    #contents = [[1,'kitty',12],[2,'muak',8],[3,'cafie',23]]
    wr2.writerow(['1','Tom',12])
    wr2.writerow(['1','Tom',12])
    wr2.writerow(['1','Tom',12])


#writerows()写入多行,但参数需要为二维列表
with open('ts.csv','a') as file:
    wr3 = csv.writer(file)
    wr3.writerow(['id','name','age'])
    contents = [[1,'Paul',12],[2,'Gairly',18]]
    wr3.writerows(contents)


#字典dic的写入方式
with open('dic.csv','a') as file:
    d1 = {'id':1,'name':'Enum','age':34}
    d2 = {'id':2,'name':'Spliey','age':27}
    filename = [ 'id','name','age']
    #利用DictWriter()来初始化一个字典写入对象
    wd = csv.DictWriter(file,fieldnames=filename)
    wd.writeheader() #先写入头信息
    wd.writerow(d1)
    wd.writerow(d2)


#写入中文
with open('dzw.csv','a',encoding='utf-8') as file:
    dz = {'id':1,'name':'看嘛','age':12}
    filename1 = ['id','name','age']
    wd1 = csv.DictWriter(file, fieldnames=filename1)
    wd1.writeheader()
    wd1.writerow(dz)
    wd1.writerow(d1)


#读取csv文件
with open('dzw.csv','r',encoding='utf-8') as file:
    reader = csv.reader(file)
    for tx in reader:
        print(tx)

3.2 关系型数据库存储

关系型数据库如SQLite、MySQL、Oracle、SQL Server、DB2等，其数据库以表的形式存储；非关系型数据库如MongoDB、Redis，其存储形式是键值对，存储形式更加灵活。

3.2.1 MySQL存储

数据库以表的形式存储。

a. 连接数据库：

""" 
连接MySQL并建立一个spider数据库 
"""
import pymysql

db = pymysql.connect(host='localhost', user='root',
     password='123456',port=3306)
#获得MySQL的操作游标
cursor = db.cursor()
cursor.execute('SELECT VERSION()')
#利用fetchone()获得第一条数据
data = cursor.fetchone()
print('Database version: ', data)
sql1 = 'CREATE DATABASE spider DEFAULT CHARACTER SET utf8'
cursor.execute(sql1)
db.close()

b. 创建表：


# 创建表
import pymysql
#连接到MySQL某个数据库
db = pymysql.connect(host='localhost', user='root',
     password='123456',port=3306,db='spider')
cursor = db.cursor()
sql = 'CREATE TABLE IF NOT EXISTS students (id VARCHAR(225) NOT NULL, \
       name VARCHAR(255) NOT NULL, age INT NOT NULL, PRIMARY KEY (id))'
cursor.execute(sql)
db.close()

c. 插入数据：

# 插入数据

import pymysql
id = '1005'
name = 'Kris'
age = 27

db = pymysql.connect(host='localhost',user='root',
     password='123456',port=3306,db='spider')
cursor = db.cursor()
sql = 'INSERT INTO students(id,name,age) values(%s, %s, %s)'
try:
    cursor.execute(sql,(id,name,age))
    #真正将语句提交到数据库执行的方法
    db.commit()
except:
    #执行失败就数据回滚 (涉及事务的问题：
    # 原子性、一致性、隔离性、持久性)
    db.rollback()
db.close()

d. 动态SQL语句:

#动态SQL语句
import pymysql

db = pymysql.connect(host='localhost',user='root',
        password='123456',port=3306,db='spider')
cursor = db.cursor()

data = {
    'id':'1003',
    'name':'Lily',
    'age':60
}
#print(data.values()) 
#dict_values(['1002', 'Uily', 18])
table = 'students'
keys = ','.join(data.keys()) #得到id,name,age
values = ','.join(['%s']*len(data))#['%s','%s','%s']
sql = ('INSERT INTO {table}({keys}) VALUES ({values})'.format(
         table=table,keys=keys,values=values))

try:
    if cursor.execute(sql,tuple(data.values())):
        print('success')
        db.commit()
except:
    print('Failed')
    db.rollback()
db.close()

e. 数据更新与去重：

#1.简单更新数据

import pymysql

db = pymysql.connect(host='localhost',user='root',
     password='123456',port=3306,db='spider')
cursor = db.cursor()
#更新name=Kris这条数据 age
sql = 'UPDATE students SET age=%s WHERE name=%s'
try:
    #简单更新，传入元组形式的参数
    cursor.execute(sql,(26,'Kris'))
    print('OK')
    db.commit()
except:
    db.rollback()
#db.close()

# 2.数据去重：数据存在则更新，不存在则插入
data = {
    'id':'1003',
    'name':'Lily2',
    'age':24
}
table = 'students'
keys = ','.join(data.keys())
values = ','.join(['%s']*len(data))
#ON DUPLICATE KEY UPDATE：主键已经存在就执行更新操作
sql2 = ('INSERT INTO {table}({keys}) VALUES ({values}) ON \
DUPLICATE KEY UPDATE'.format(table=table,keys=keys,values=values))

update = ','.join(["{key} = %s".format(key=key) for key in data])
sql2 = sql2+' '+ update
print(sql2)
# sql2 = INSERT INTO students(id,name,age) VALUES(%s,$s,$s) ON 
# DUPLICATE KEY UPDATE id=%s,name=%s,age=%s
try:
    if cursor.execute(sql2,tuple(data.values())*2):
        print('Success')
        db.commit()
except:
    print('Failed')
    db.rollback()
db.close()

f. 删除数据：

# 删除数据

import pymysql
db = pymysql.connect(host='localhost',user='root',
        password='123456',port=3306,db='spider')
cursor = db.cursor()
table ='students'
condition = 'age > 25'
sql = 'DELETE FROM {table} WHERE \
      {condition}'.format(table=table,condition=condition)
try:
    cursor.execute(sql)
    db.commit()
    print('DELETE OK')
except:
    db.rollback()
db.close()

g. 查询数据:

# 查询数据

import pymysql

db = pymysql.connect(host='localhost',user='root',
       password='123456',port=3306,db='spider')
cursor = db.cursor()
sql = 'SELECT *FROM students WHERE age > 20'
try:
    cursor.execute(sql)
    count = cursor.rowcount
    print('结果条数：' + str(count))
    one = cursor.fetchone()
    print(one)
    #内部有偏移指针，取一条指针就移到下一条
    #fetchall()会将结果以tuple形式全部返回，
    #如果数据量很大很占开销，
    #建议while(),fetchone()方法
    rest = cursor.fetchall()
    print(type(rest))
    for row in rest:
        print(row)

except:
    db.rollback()
#db.close()

print('===fetchone()+while()方法====')

sql2 = 'SELECT *FROM students WHERE age>20'
try:
    cursor.execute(sql2)
    print('条数：'+str(cursor.rowcount))
    row = cursor.fetchone()
    while row:
        print('Row: ',row)
        row = cursor.fetchone()
except:
    print('error')

3.2.1 MongoDB存储

连接及简单的插入：

import pymongo

# 连接MongoDB数据库
#1. IP和port
client = pymongo.MongoClient(host='localhost',port=27017)
#2. 字符串
# client = pymongo.MongoClient('mongodb://localhost:27017/')

#指定连接的数据库
db = client.admin #第一种方法
# db = client['test'] #第二种方法

#指定要操作的集合(类似表)
collection = db.students

#插入单条数据
s1 = {
    'id':'20180722',
    'name':'Cerify',
    'gender':'female',
    'age':27
}
result = collection.insert(s1)
print(s1)

s2 = {
    'id':'20180721',
    'name':'Jasen',
    'gender':'',
    'age':17
}
s3 = {
    'id':'20170720',
    'name':'Puly',
    'age':56
}
r2 = collection.insert([s2,s3])
print(r2)

# 官方推荐使用insert_one()、insert_many()
# 二者返回对象是InsertOneResult对象(InsertManyResult)，
# 可调用inserted_id属性来获取id
r3 = collection.insert_one(s1)
print(type(r3))
r4 = collection.insert_many([s2,s3])
#print(r4)
#print(r3.inserted_ids)

一些基本操作：

import pymongo

#连接到mongodb数据库
client = pymongo.MongoClient(host='localhost',port=27017)

#连接到指定数据库
db = client.admin

#连接到指定结合(类似表)
collection = db.students

# 查询:find_one()返回的是单个结果
#find()返回的是一个生成器对象
r1 = collection.find_one({'name':'Puly'})#参数为dict
print(r1)#单个结果
r2 = collection.find({'name':'Cerify'})
print(r2)
for s in r2:
    print(s)

print('=== age<20 ===')

r3 = collection.find({'age':{'$lt':20}})
for s in r3:
    print(s)


print('===查询名字以J开头的学生数据===')
#正则匹配查询 $regex指定正则匹配
#查询名字以J开头的学生数据
r4 = collection.find({'name':{'$regex':'^J.*'}})
for s in r4:
    print(s)

# 计数
# 查询名字以 y 结尾的 数量
count = collection.find({'name':{'$regex':'.*y$'}}).count()
print(count)

print('==排序==')
# 排序
r5 = collection.find().sort('name',pymongo.ASCENDING)
print([result['name'] for result in r5])
""" for r in r5:
    print(r['age']) """


# 偏移
print('==偏移与限制==')
r6 = collection.find().sort('name',pymongo.ASCENDING).skip(6).limit(2)
print([result['name'] for result in r6])

更新与删除操作：

import pymongo

client = pymongo.MongoClient(host='localhost',port=27017)
db = client.admin
collection = db.students

#更新:update(),指定更新的条件和数据，
# 推荐使用update_one()和update_many()
condition = {'id':'20170729'}
s1 = collection.find_one(condition)
s1['age'] = 99
print(s1)
s1_modi = collection.update(condition,s1)
print(s1_modi)

#推荐用法
condition2 = {'id':'20180728'}
s2 = collection.find_one(condition2)
s2['age'] = 99
s2_modi = collection.update_one(condition2,{'$set':s2})
print(s2_modi)
print(s2_modi.matched_count, s2_modi.modified_count)

#查询年龄大于90的 且年龄加1操作

cond_old = {'age':{'$gt':90}} #年龄大于90
cond_new = {'$inc':{'age':1}} #年龄加1
r2 = collection.update_many(cond_old, cond_new )
print(type(r2))
print(r2.modified_count, r2.matched_count)


#删除
# 直接调用remove(),指定删除条件即可，
# 推荐delete_one()和delete_many()
r3 = collection.remove({'id':'1001'})
print(r3)

r4 = collection.delete_one({'id':'1002'})
print(r4.deleted_count)

r5 = collection.delete_many({'age':{'$lt':2}})
print(r5.deleted_count)

Python爬虫

3. 数据存储

3.1 文件存储

3.1.1 TXT文本存储

3.1.2 Json文件存储日常使用操作

3.1.3 CSV文件存储

3.2 关系型数据库存储

3.2.1 MySQL存储

3.2.1 MongoDB存储