数据
具体数据和代码在网盘: 链接: https://pan.baidu.com/s/1qYyAyvi 密码: rnbj
相关文件: 01_user_goods_score.data。
数据中有3个字段分别使用逗号(,)隔开,如下:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
|
用户,商品,评分
user1,101,5.0
user1,102,3.0
user1,103,2.5
user2,101,2.0
user2,102,2.5
user2,103,5.0
user2,104,2.0
user3,101,2.0
user3,104,4.0
user3,105,4.5
user3,107,5.0
user4,101,5.0
user4,103,3.0
user4,104,4.5
user4,106,4.0
user5,101,4.0
user5,102,3.0
user5,103,2.0
user5,104,4.0
user5,105,3.5
user5,106,4.0
|
获得有多少个唯一的商品
使用源数据(01_user_goods_score.data), 通过MRJob计算出唯一的商品。
MRJob代码(pandas_01_goods.py)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
|
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from
mrjob
.
job
import
MRJob
class Pandas01GoodsIndex(MRJob):
    """List every distinct goods id found in the rating file.

    Each input line is expected to look like ``user,goods,score``.  The job
    emits one ``(goods, 1)`` pair per distinct goods id; the ``1`` is only a
    placeholder value.
    """

    def mapper(self, _, line):
        """Emit the goods id of one rating line.

        Args:
            _: Input key, ignored.
            line: One ``user,goods,score`` text line.

        Yields:
            ``(goods, None)``; only the key matters for de-duplication.

        Raises:
            ValueError: If a non-blank line does not have exactly 3 fields.
        """
        # A blank (e.g. trailing) line would otherwise crash the unpacking.
        if not line.strip():
            return
        # Parse the line: user, goods, score (only goods is needed here).
        _user, goods, _score = line.split(',')
        yield goods.strip(), None

    def reducer(self, key, values):
        """Collapse all occurrences of a goods id into a single output row.

        Args:
            key: The goods id.
            values: The mapper values for this key (unused).

        Yields:
            ``(key, 1)`` exactly once per distinct key.
        """
        yield key, 1
def main():
    """Run the distinct-goods MRJob using the command-line arguments."""
    Pandas01GoodsIndex.run()


# Only start the job when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|
执行
1
2
3
4
5
6
7
8
9
|
python pandas_01_goods.py 01_user_goods_score.data > pandas_01_goods.data
cat pandas_01_goods.data
"101"	1
"102"	1
"103"	1
"104"	1
"105"	1
"106"	1
"107"	1
|
注意: 上面输出结果中每行末尾的 1 只是占位符, 没有实际意义。
获得有多少个唯一的用户
方法和计算出唯一的商品是类似的, 同样也是使用(01_user_goods_score.data)中的数据通过MRJob分析得出
MRJob代码(pandas_01_user.py)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
|
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from
mrjob
.
job
import
MRJob
class Pandas01GoodsIndex(MRJob):
    """List every distinct user id found in the rating file.

    Each input line is expected to look like ``user,goods,score``.  The job
    emits one ``(user, 1)`` pair per distinct user id; the ``1`` is only a
    placeholder value.

    NOTE(review): the class name is a copy-paste leftover from the goods
    job; it is kept unchanged because ``main()`` refers to it by this name.
    """

    def mapper(self, _, line):
        """Emit the user id of one rating line.

        Args:
            _: Input key, ignored.
            line: One ``user,goods,score`` text line.

        Yields:
            ``(user, None)``; only the key matters for de-duplication.

        Raises:
            ValueError: If a non-blank line does not have exactly 3 fields.
        """
        # A blank (e.g. trailing) line would otherwise crash the unpacking.
        if not line.strip():
            return
        # Parse the line: user, goods, score (only user is needed here).
        user, _goods, _score = line.split(',')
        yield user.strip(), None

    def reducer(self, key, values):
        """Collapse all occurrences of a user id into a single output row.

        Args:
            key: The user id.
            values: The mapper values for this key (unused).

        Yields:
            ``(key, 1)`` exactly once per distinct key.
        """
        yield key, 1
def main():
    """Run the distinct-user MRJob using the command-line arguments."""
    Pandas01GoodsIndex.run()


# Only start the job when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|
执行
1
2
3
4
5
6
7
|
python pandas_01_user.py 01_user_goods_score.data > pandas_01_user.data
cat pandas_01_user.data
"user1"	1
"user2"	1
"user3"	1
"user4"	1
"user5"	1
|
计算用户推荐商品
思路:
- 将商品(pandas_01_goods.data)转化成 key value 对(其中 value 是该商品在矩阵中的下标), 如下:
1
2
3
4
5
6
7
8
9
|
# Example mapping: goods id -> row/column index used inside the matrices.
goods_dict = {
    '101': 0,
    '102': 1,
    '103': 2,
    '104': 3,
    '105': 4,
    '106': 5,
    '107': 6,
}
|
- 将用户(pandas_01_user.data)转化成 key value 对(其中 value 是该用户在矩阵中的下标), 如下:
1
2
3
4
5
6
7
|
# Example mapping: user id -> column index used inside the score matrix.
user_dict = {
    'user1': 0,
    'user2': 1,
    'user3': 2,
    'user4': 3,
    'user5': 4,
}
|
- 通过读取(02_user_goods_score_record.data)中的数据, 计算出商品的购买次数矩阵(即商品同现矩阵)。
这里使用了 Python 的科学计算库 scipy 来构造稀疏矩阵。
在生成矩阵的时候使用 goods_dict 中的值(下标)来代替商品, 到了最后再替换回来
- 通过读取(01_user_goods_score.data)中的数据, 计算出用户商品评分矩阵(仅包含用户购买并评分过的商品)。
在生成矩阵的时候使用 user_dict 中的值(下标)来代替用户, 到了最后再替换回来
- <商品购买矩阵> X <用户商品评分矩阵> = <每个用户每个商品评分矩阵>
- 将 <每个用户每个商品评分矩阵> 结合 Pandas、goods_dict、user_dict 得出最终结果
计算物品推荐代码(pandas_02_final.py)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
|
#!/usr/bin/env python
#-*- coding:utf-8 -*-
from
scipy
import
sparse
import
numpy
as
np
import
pandas
as
pd
class Pandas02Final(object):
    """Compute item-based recommendation scores for every user/goods pair.

    Typical call order (each step stores its result on the instance):

    1. ``get_goods_dict`` / ``get_user_dict`` - id -> matrix index maps.
    2. ``get_goods_bought_count_matrix`` - goods x goods co-occurrence counts.
    3. ``get_user_goods_score_matrix`` - goods x user known ratings.
    4. ``get_every_user_goods_score_matrix`` - product of the two matrices.
    5. ``get_every_user_goods_score_df`` - the product labelled with ids.
    """

    def __init__(self):
        self.goods_dict = {}  # goods id -> matrix index
        self.user_dict = {}  # user id -> matrix index
        self.goods_bought_count_matrix = None
        self.user_goods_score_matrix = None
        self.every_user_goods_score_matrix = None
        self.every_user_goods_score_df = None

    @staticmethod
    def _read_index_dict(file_name, mapping):
        """Fill *mapping* with ``id -> index`` from a ``"id"<sep>1`` file."""
        with open(file_name) as f:
            for line in f:
                items = line.split('"')
                # Lines without a quoted id (e.g. blank lines) carry no key.
                if len(items) < 3:
                    continue
                # Indices follow first appearance and stay contiguous even
                # when lines are skipped or an id is repeated.
                mapping.setdefault(items[1], len(mapping))

    @staticmethod
    def _labels_in_index_order(mapping):
        """Return the ids of *mapping* ordered by their matrix index."""
        return sorted(mapping, key=mapping.get)

    @staticmethod
    def _to_dense(values, row_indices, col_indices, shape):
        """Build a dense matrix from COO triplets; duplicates are summed."""
        return sparse.coo_matrix(
            (values,
             (np.asarray(row_indices, dtype=int),
              np.asarray(col_indices, dtype=int))),
            shape=shape).todense()

    def get_goods_dict(self, file_name):
        """Load the goods id -> index dictionary into ``self.goods_dict``.

        Args:
            file_name: Path of a file whose lines start with a double-quoted
                goods id (the format shown for pandas_01_goods.data).
        """
        self._read_index_dict(file_name, self.goods_dict)

    def get_user_dict(self, file_name):
        """Load the user id -> index dictionary into ``self.user_dict``.

        Args:
            file_name: Path of a file whose lines start with a double-quoted
                user id (the format shown for pandas_01_user.data).
        """
        self._read_index_dict(file_name, self.user_dict)

    def get_goods_bought_count_matrix(self, file_name):
        """Build the goods x goods purchase-count (co-occurrence) matrix.

        For every line, each ordered pair of goods on that line (including a
        goods paired with itself) contributes 1 to the matching cell.  The
        result is stored in ``self.goods_bought_count_matrix``.

        Args:
            file_name: Path of the record file.  The second double-quoted
                field of each line must hold ``goods:score,goods:score,...``.

        Raises:
            KeyError: If a goods id is missing from ``self.goods_dict``.
        """
        row_indices = []  # rows are goods
        col_indices = []  # columns are goods
        with open(file_name) as f:
            for line in f:
                items = line.split('"')
                # Skip lines that lack the quoted goods:score list.
                if len(items) < 4:
                    continue
                # Resolve each goods id once per line instead of re-splitting
                # the same string inside the nested loop.
                goods_indices = [
                    self.goods_dict[goods_score.split(':')[0].strip()]
                    for goods_score in items[3].split(',')
                    if goods_score.strip()
                ]
                for goods_row in goods_indices:
                    for goods_col in goods_indices:
                        row_indices.append(goods_row)
                        col_indices.append(goods_col)

        # Every recorded pair counts once; the COO -> dense conversion sums
        # repeated (row, col) entries into the final counts.
        values = np.ones(len(row_indices), dtype=int)
        size = len(self.goods_dict)
        self.goods_bought_count_matrix = self._to_dense(
            values, row_indices, col_indices, (size, size))

    def get_user_goods_score_matrix(self, file_name):
        """Build the goods x user rating matrix (only known ratings).

        The result is stored in ``self.user_goods_score_matrix``; cells with
        no rating in the file are 0.

        Args:
            file_name: Path of a ``user,goods,score`` file.

        Raises:
            KeyError: If a user or goods id is missing from the dictionaries.
            ValueError: If a non-blank line does not have exactly 3 fields or
                the score is not a number.
        """
        row_indices = []  # rows are goods
        col_indices = []  # columns are users
        values = []  # the user's score for the goods
        with open(file_name) as f:
            for line in f:
                # A blank (e.g. trailing) line would crash the unpacking.
                if not line.strip():
                    continue
                user, goods, score = [
                    field.strip() for field in line.split(',')]
                row_indices.append(self.goods_dict[goods])
                col_indices.append(self.user_dict[user])
                values.append(float(score))

        self.user_goods_score_matrix = self._to_dense(
            np.asarray(values, dtype=float), row_indices, col_indices,
            (len(self.goods_dict), len(self.user_dict)))

    def get_every_user_goods_score_matrix(self):
        """Compute the score of every goods for every user.

        Formula: <goods purchase-count matrix> x <user goods score matrix>.
        The result is stored in ``self.every_user_goods_score_matrix``.
        """
        # np.dot is a true matrix product for both np.matrix and ndarray
        # operands, so it does not rely on np.matrix overloading ``*``.
        self.every_user_goods_score_matrix = np.dot(
            self.goods_bought_count_matrix, self.user_goods_score_matrix)

    def get_every_user_goods_score_df(self):
        """Label the final score matrix with goods ids and user ids.

        The result is stored in ``self.every_user_goods_score_df`` with goods
        as the index and users as the columns.
        """
        data = self.every_user_goods_score_matrix
        if data is not None:
            data = np.asarray(data)
        # Labels must follow the matrix index stored in the dictionaries; an
        # alphabetical sort of the ids only matches when the input files
        # happen to be alphabetically ordered.
        self.every_user_goods_score_df = pd.DataFrame(
            data,
            columns=self._labels_in_index_order(self.user_dict),
            index=self._labels_in_index_order(self.goods_dict))
def main():
    """Run the six recommendation steps in order and print each result.

    The four input files are opened by their bare names, i.e. relative to
    the current working directory.
    """
    separator = '=' * 46

    def show(title, result):
        # print() with a single argument behaves identically on Python 2
        # and Python 3, unlike the Python 2-only print statement.
        print(separator)
        print(title)
        print(separator)
        print(result)

    pandas_02_final = Pandas02Final()

    # 1. Goods id -> index dictionary.
    pandas_02_final.get_goods_dict('pandas_01_goods.data')
    show('1. 获得商品数据字典', pandas_02_final.goods_dict)

    # 2. User id -> index dictionary.
    pandas_02_final.get_user_dict('pandas_01_user.data')
    show('2. 获得用户数据字典', pandas_02_final.user_dict)

    # 3. Goods purchase-count (co-occurrence) matrix.
    pandas_02_final.get_goods_bought_count_matrix(
        '02_user_goods_score_record.data')
    show('3. 获得商品购买次数矩阵',
         pandas_02_final.goods_bought_count_matrix)

    # 4. User/goods score matrix (only goods the user has rated).
    pandas_02_final.get_user_goods_score_matrix('01_user_goods_score.data')
    show('4. 获得用户商品评分矩矩阵(仅仅包含用户买过的商品)',
         pandas_02_final.user_goods_score_matrix)

    # 5. Multiply the two matrices to score every goods for every user.
    pandas_02_final.get_every_user_goods_score_matrix()
    show('5. 两个矩阵相乘获得最终的 <每个用户每个商品评分矩阵>',
         pandas_02_final.every_user_goods_score_matrix)

    # 6. Attach goods/user labels to the score matrix with pandas.
    pandas_02_final.get_every_user_goods_score_df()
    show('6. <每个用户每个商品评分矩阵> 和 Pandas, goods_dict, user_dict 结合获得最终的数据',
         pandas_02_final.every_user_goods_score_df)


# Only run the pipeline when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|
执行与结果
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
|
python pandas_02_final.py
==============================================
1. 获得商品数据字典
==============================================
{'102': 1, '103': 2, '101': 0, '106': 5, '107': 6, '104': 3, '105': 4}
==============================================
2. 获得用户数据字典
==============================================
{'user4': 3, 'user5': 4, 'user2': 1, 'user3': 2, 'user1': 0}
==============================================
3. 获得商品购买次数矩阵
==============================================
[[5 3 4 4 2 2 1]
 [3 3 3 2 1 1 0]
 [4 3 4 3 1 2 0]
 [4 2 3 4 2 2 1]
 [2 1 1 2 2 1 1]
 [2 1 2 2 1 2 0]
 [1 0 0 1 1 0 1]]
==============================================
4. 获得用户商品评分矩矩阵(仅仅包含用户买过的商品)
==============================================
[[ 5.   2.   2.   5.   4. ]
 [ 3.   2.5  0.   0.   3. ]
 [ 2.5  5.   0.   3.   2. ]
 [ 0.   2.   4.   4.5  4. ]
 [ 0.   0.   4.5  0.   3.5]
 [ 0.   0.   0.   4.   4. ]
 [ 0.   0.   5.   0.   0. ]]
==============================================
5. 两个矩阵相乘获得最终的 <每个用户每个商品评分矩阵>
==============================================
[[ 44.   45.5  40.   63.   68. ]
 [ 31.5  32.5  18.5  37.   42.5]
 [ 39.   41.5  24.5  53.5  56.5]
 [ 33.5  36.   38.   55.   59. ]
 [ 15.5  15.5  26.   26.   32. ]
 [ 18.   20.5  16.5  33.   34.5]
 [  5.    4.   15.5   9.5  11.5]]
==============================================
6. <每个用户每个商品评分矩阵> 和 Pandas, goods_dict, user_dict 结合获得最终的数据
==============================================
     user1  user2  user3  user4  user5
101   44.0   45.5   40.0   63.0   68.0
102   31.5   32.5   18.5   37.0   42.5
103   39.0   41.5   24.5   53.5   56.5
104   33.5   36.0   38.0   55.0   59.0
105   15.5   15.5   26.0   26.0   32.0
106   18.0   20.5   16.5   33.0   34.5
107    5.0    4.0   15.5    9.5   11.5
|
从最后的结果可以很容易地看出每个用户对各个商品的得分, 从而可以得到需要推荐的商品(需要剔除用户已经购买并评分过的商品)。
昵称: HH
QQ: 275258836
ttlsa群交流沟通(QQ群②: 6690706 QQ群③: 168085569 QQ群④: 415230207(新) 微信公众号: ttlsacom)
感觉本文内容不错,读后有收获?
逛逛衣服店,鼓励作者写出更好文章。
收 藏
转载请注明:成长的对话 » 物品推荐-Pandas-Python数据分析(22)