3. 数据存储

3. 数据存储

3.1 文件存储

3.1.1 TXT文本存储

以爬取知乎–发现–页面-今日最热-部分为例,爬取标题、回答者、答案,并保存到文本中。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
""" 
爬取知乎今日最热问答
"""
import requests
from urllib import error
import socket
from pyquery import PyQuery as pq
import datetime

class ZhiHuEx():
"""
知乎今日最热爬虫类
"""
url = 'https://www.zhihu.com/explore'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) \
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}

def get_contents(self):
"""
获取网页源码内容并实例化pyquery对象
"""
try:
rep = requests.get(ZhiHuEx.url, headers=ZhiHuEx.headers)
htmls = rep.text
doc = pq(htmls)
return doc
except error.HTTPError as e:
print(e.reason)
except error.URLError as e:
if isinstance(e.reason, socket.timeout):
print('time out')
else:
print(e.reason)

def analy_data(self, doc):
"""
获取question、author、answer内容并返回
"""
##PyQuery对象,调用items()方法可得到一个生成器
items = doc('.explore-feed.feed-item')
for item in items.items():
question = item('.question_link').text()
author = item.find('.author-link-line').text()

#由于答案部分的html代码,标签与文字混杂在一起,
# 故先调用html()方法,将其标签格式等规范化,
# 再构建pq对象,直接调用text()方法
answer = pq(item.find('.content').html()).text()

#调试用
#an = pq(item.find('.content').html())
#print(an, end='==========\n')
#print(type(an))

return question,author,answer

def file_save(self,question,author,answer):

"""
将数据按照指定格式一条一条保存到本地txt
"""
i = datetime.datetime.now()
number = (str(i.year)+'-'+str(i.month)+'-'+
str(i.day)+'-'+str(i.hour)+'-'+str(i.minute)+'-'+str(i.second))
file = open(number+'zhihu-explore.txt','a',encoding='utf-8')
file.write('\n'.join([question, author, answer]))
file.write('\n' + '='*50 + '\n')
file.close()

with open(number+'z-h-e.txt','a',encoding='utf-8') as file:
file.write('\n'.join([question, author, answer]))
file.write('\n' + '='*50 + '\n')


def go(self):
""" 入口函数
"""
doc = self.get_contents()
question,author,answer = self.analy_data(doc)
self.file_save(question,author,answer)

zhe = ZhiHuEx()
zhe.go()

3.1.2 Json文件存储日常使用操作

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import json

# 将Json文本转换为Json对象:loads()

#json文本字符串需要 双引号 来表示,
# 否则loads会报错
str = '''[{
"name":"hi",
"age":12
},{
"name":"bg",
"age":18
}]'''
data = json.loads(str)#转换为json对象
print(data)
#[{'name': 'hi', 'age': 12}, {'name': 'bg', 'age': 18}]

#对json对象取值
print(data[0].get('name')) # hi
print(data[0].get('sex')) #None
print(data[0].get('sex','man')) #man
print(data[0]['name']) # hi

# 输出json(需调用dumps()方法)
with open('data.json','w') as file:
#将json对象data转换为str,写入文本
file.write(json.dumps(data))


#读取json

with open('data.json','r') as file:
sa = file.read() #读到的是json字符串str
ss = json.loads(sa)
print(ss[1]['name']) #bg

#关于中文转码
#如何保存Json缩进空格格式

data2 = [{
'name':'猴哥',
'age':1000
}]
#参数indent代表缩进字符个数
with open('d2.json','w',encoding='utf-8') as file:
file.write(json.dumps(data2, indent=2, ensure_ascii=False))

有格式的保存Json字符串到本地:

直接保存Json字符串到本地:

3.1.3 CSV文件存储

详见代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
#自己写的py文件名,
# 不要和python模块名字相同!!!!
import csv

with open('text1.csv','a') as csvfile:
#调用writer()方法,初始化写入对象,
# 并传入文件句柄csvfile
wr = csv.writer(csvfile)
wr.writerow(['id','name','age'])
wr.writerow(['1','Tom',12])

#写入的文本默认以逗号分隔,
# 若想修改列与列之间的分隔符,可传入delimiter参数
with open('t1.csv','w') as csvfile:
wr2 = csv.writer(csvfile,delimiter=' ')
wr2.writerow(['id','name','age'])
#contents = [[1,'kitty',12],[2,'muak',8],[3,'cafie',23]]
wr2.writerow(['1','Tom',12])
wr2.writerow(['1','Tom',12])
wr2.writerow(['1','Tom',12])


#writerows()写入多行,但参数需要为二维列表
with open('ts.csv','a') as file:
wr3 = csv.writer(file)
wr3.writerow(['id','name','age'])
contents = [[1,'Paul',12],[2,'Gairly',18]]
wr3.writerows(contents)


#字典dic的写入方式
with open('dic.csv','a') as file:
d1 = {'id':1,'name':'Enum','age':34}
d2 = {'id':2,'name':'Spliey','age':27}
filename = [ 'id','name','age']
#利用DictWriter()来初始化一个字典写入对象
wd = csv.DictWriter(file,fieldnames=filename)
wd.writeheader() #先写入头信息
wd.writerow(d1)
wd.writerow(d2)


#写入中文
with open('dzw.csv','a',encoding='utf-8') as file:
dz = {'id':1,'name':'看嘛','age':12}
filename1 = ['id','name','age']
wd1 = csv.DictWriter(file, fieldnames=filename1)
wd1.writeheader()
wd1.writerow(dz)
wd1.writerow(d1)


#读取csv文件
with open('dzw.csv','r',encoding='utf-8') as file:
reader = csv.reader(file)
for tx in reader:
print(tx)

3.2 关系型数据库存储

关系型数据库如SQLite、MySQL、Oracle、SQL Server、DB2等,其数据库以表的形式存储;非关系型数据库如MongoDB、Redis,其存储形式是键值对,存储形式更加灵活。

3.2.1 MySQL存储

数据库以表的形式存储。

a. 连接数据库:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
""" 
连接MySQL并建立一个spider数据库
"""
import pymysql

db = pymysql.connect(host='localhost', user='root',
password='123456',port=3306)
#获得MySQL的操作游标
cursor = db.cursor()
cursor.execute('SELECT VERSION()')
#利用fetchone()获得第一条数据
data = cursor.fetchone()
print('Database version: ', data)
sql1 = 'CREATE DATABASE spider DEFAULT CHARACTER SET utf8'
cursor.execute(sql1)
db.close()

b. 创建表:

1
2
3
4
5
6
7
8
9
10
11

# 创建表
import pymysql
#连接到MySQL某个数据库
db = pymysql.connect(host='localhost', user='root',
password='123456',port=3306,db='spider')
cursor = db.cursor()
sql = 'CREATE TABLE IF NOT EXISTS students (id VARCHAR(225) NOT NULL, \
name VARCHAR(255) NOT NULL, age INT NOT NULL, PRIMARY KEY (id))'
cursor.execute(sql)
db.close()

c. 插入数据:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
# 插入数据

import pymysql
id = '1005'
name = 'Kris'
age = 27

db = pymysql.connect(host='localhost',user='root',
password='123456',port=3306,db='spider')
cursor = db.cursor()
sql = 'INSERT INTO students(id,name,age) values(%s, %s, %s)'
try:
cursor.execute(sql,(id,name,age))
#真正将语句提交到数据库执行的方法
db.commit()
except:
#执行失败就数据回滚 (涉及事务的问题:
# 原子性、一致性、隔离性、持久性)
db.rollback()
db.close()

d. 动态SQL语句:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
#动态SQL语句
import pymysql

db = pymysql.connect(host='localhost',user='root',
password='123456',port=3306,db='spider')
cursor = db.cursor()

data = {
'id':'1003',
'name':'Lily',
'age':60
}
#print(data.values())
#dict_values(['1002', 'Uily', 18])
table = 'students'
keys = ','.join(data.keys()) #得到id,name,age
values = ','.join(['%s']*len(data))#['%s','%s','%s']
sql = ('INSERT INTO {table}({keys}) VALUES ({values})'.format(
table=table,keys=keys,values=values))

try:
if cursor.execute(sql,tuple(data.values())):
print('success')
db.commit()
except:
print('Failed')
db.rollback()
db.close()

e. 数据更新与去重:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
#1.简单更新数据

import pymysql

db = pymysql.connect(host='localhost',user='root',
password='123456',port=3306,db='spider')
cursor = db.cursor()
#更新name=Kris这条数据 age
sql = 'UPDATE students SET age=%s WHERE name=%s'
try:
#简单更新,传入元组形式的参数
cursor.execute(sql,(26,'Kris'))
print('OK')
db.commit()
except:
db.rollback()
#db.close()

# 2.数据去重:数据存在则更新,不存在则插入
data = {
'id':'1003',
'name':'Lily2',
'age':24
}
table = 'students'
keys = ','.join(data.keys())
values = ','.join(['%s']*len(data))
#ON DUPLICATE KEY UPDATE:主键已经存在就执行更新操作
sql2 = ('INSERT INTO {table}({keys}) VALUES ({values}) ON \
DUPLICATE KEY UPDATE'.format(table=table,keys=keys,values=values))

update = ','.join(["{key} = %s".format(key=key) for key in data])
sql2 = sql2+' '+ update
print(sql2)
# sql2 = INSERT INTO students(id,name,age) VALUES(%s,$s,$s) ON
# DUPLICATE KEY UPDATE id=%s,name=%s,age=%s
try:
if cursor.execute(sql2,tuple(data.values())*2):
print('Success')
db.commit()
except:
print('Failed')
db.rollback()
db.close()

f. 删除数据:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# 删除数据

import pymysql
db = pymysql.connect(host='localhost',user='root',
password='123456',port=3306,db='spider')
cursor = db.cursor()
table ='students'
condition = 'age > 25'
sql = 'DELETE FROM {table} WHERE \
{condition}'.format(table=table,condition=condition)
try:
cursor.execute(sql)
db.commit()
print('DELETE OK')
except:
db.rollback()
db.close()

g. 查询数据:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# 查询数据

import pymysql

db = pymysql.connect(host='localhost',user='root',
password='123456',port=3306,db='spider')
cursor = db.cursor()
sql = 'SELECT *FROM students WHERE age > 20'
try:
cursor.execute(sql)
count = cursor.rowcount
print('结果条数:' + str(count))
one = cursor.fetchone()
print(one)
#内部有偏移指针,取一条指针就移到下一条
#fetchall()会将结果以tuple形式全部返回,
#如果数据量很大很占开销,
#建议while(),fetchone()方法
rest = cursor.fetchall()
print(type(rest))
for row in rest:
print(row)

except:
db.rollback()
#db.close()

print('===fetchone()+while()方法====')

sql2 = 'SELECT *FROM students WHERE age>20'
try:
cursor.execute(sql2)
print('条数:'+str(cursor.rowcount))
row = cursor.fetchone()
while row:
print('Row: ',row)
row = cursor.fetchone()
except:
print('error')

3.2.1 MongoDB存储

连接及简单的插入:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import pymongo

# 连接MongoDB数据库
#1. IP和port
client = pymongo.MongoClient(host='localhost',port=27017)
#2. 字符串
# client = pymongo.MongoClient('mongodb://localhost:27017/')

#指定连接的数据库
db = client.admin #第一种方法
# db = client['test'] #第二种方法

#指定要操作的集合(类似表)
collection = db.students

#插入单条数据
s1 = {
'id':'20180722',
'name':'Cerify',
'gender':'female',
'age':27
}
result = collection.insert(s1)
print(s1)

s2 = {
'id':'20180721',
'name':'Jasen',
'gender':'',
'age':17
}
s3 = {
'id':'20170720',
'name':'Puly',
'age':56
}
r2 = collection.insert([s2,s3])
print(r2)

# 官方推荐使用insert_one()、insert_many()
# 二者返回对象是InsertOneResult对象(InsertManyResult),
# 可调用inserted_id属性来获取id
r3 = collection.insert_one(s1)
print(type(r3))
r4 = collection.insert_many([s2,s3])
#print(r4)
#print(r3.inserted_ids)

一些基本操作:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import pymongo

#连接到mongodb数据库
client = pymongo.MongoClient(host='localhost',port=27017)

#连接到指定数据库
db = client.admin

#连接到指定结合(类似表)
collection = db.students

# 查询:find_one()返回的是单个结果
#find()返回的是一个生成器对象
r1 = collection.find_one({'name':'Puly'})#参数为dict
print(r1)#单个结果
r2 = collection.find({'name':'Cerify'})
print(r2)
for s in r2:
print(s)

print('=== age<20 ===')

r3 = collection.find({'age':{'$lt':20}})
for s in r3:
print(s)


print('===查询名字以J开头的学生数据===')
#正则匹配查询 $regex指定正则匹配
#查询名字以J开头的学生数据
r4 = collection.find({'name':{'$regex':'^J.*'}})
for s in r4:
print(s)

# 计数
# 查询名字以 y 结尾的 数量
count = collection.find({'name':{'$regex':'.*y$'}}).count()
print(count)

print('==排序==')
# 排序
r5 = collection.find().sort('name',pymongo.ASCENDING)
print([result['name'] for result in r5])
""" for r in r5:
print(r['age']) """


# 偏移
print('==偏移与限制==')
r6 = collection.find().sort('name',pymongo.ASCENDING).skip(6).limit(2)
print([result['name'] for result in r6])

更新与删除操作:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import pymongo

client = pymongo.MongoClient(host='localhost',port=27017)
db = client.admin
collection = db.students

#更新:update(),指定更新的条件和数据,
# 推荐使用update_one()和update_many()
condition = {'id':'20170729'}
s1 = collection.find_one(condition)
s1['age'] = 99
print(s1)
s1_modi = collection.update(condition,s1)
print(s1_modi)

#推荐用法
condition2 = {'id':'20180728'}
s2 = collection.find_one(condition2)
s2['age'] = 99
s2_modi = collection.update_one(condition2,{'$set':s2})
print(s2_modi)
print(s2_modi.matched_count, s2_modi.modified_count)

#查询年龄大于90的 且年龄加1操作

cond_old = {'age':{'$gt':90}} #年龄大于90
cond_new = {'$inc':{'age':1}} #年龄加1
r2 = collection.update_many(cond_old, cond_new )
print(type(r2))
print(r2.modified_count, r2.matched_count)


#删除
# 直接调用remove(),指定删除条件即可,
# 推荐delete_one()和delete_many()
r3 = collection.remove({'id':'1001'})
print(r3)

r4 = collection.delete_one({'id':'1002'})
print(r4.deleted_count)

r5 = collection.delete_many({'age':{'$lt':2}})
print(r5.deleted_count)
分享到