1.1. Pandas分析步骤
- 载入数据
- 将 外链点击数 进行 COUNT。类似如下SQL:
1
2
3
4
5
6
|
-- Top 100 referrer URLs ranked by how many log rows carry them.
SELECT reference_url,
       COUNT(*)
FROM log
GROUP BY reference_url
-- DESC so the most-clicked referrers come first; without it the LIMIT
-- would keep the 100 least-clicked ones.
ORDER BY COUNT(*) DESC
LIMIT 0, 100;
|
1.2. 代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
|
cat
pd_ng_log_stat
.
py
#!/usr/bin/env python
#-*- coding: utf-8 -*-
from
ng_line_parser
import
NgLineParser
import
pandas
as
pd
import
socket
import
struct
class PDNgLogStat(object):
    """Load access-log files into a pandas DataFrame and compute statistics.

    Every log line is handed to an ``NgLineParser`` instance; whatever its
    ``to_dict()`` returns for that line becomes one row of ``self.df``.
    """

    # Types treated as "a single path" rather than "a collection of paths".
    # ``type(u'')`` is ``unicode`` on Python 2 and ``str`` on Python 3, so the
    # tuple works on both interpreters.
    _STRING_TYPES = (str, bytes, type(u''))

    def __init__(self):
        self.ng_line_parser = NgLineParser()

    def _log_line_iter(self, pathes):
        """Parse every line of every file and yield one record per line.

        :param pathes: iterable of file paths to read, in order.
        :return: generator of the values produced by
            ``self.ng_line_parser.to_dict()`` after parsing each line.
        """
        for path in pathes:
            with open(path, 'r') as f:
                # Iterate the file object directly; the line index is not
                # needed.
                for line in f:
                    self.ng_line_parser.parse(line)
                    yield self.ng_line_parser.to_dict()

    def load_data(self, path):
        """Build ``self.df`` from the given log file(s).

        :param path: a list (or other iterable) of file paths. A single
            path string is also accepted and treated as a one-element list.
        """
        # A bare string would otherwise be iterated character by character,
        # and each character opened as if it were a file name.
        if isinstance(path, self._STRING_TYPES):
            path = [path]
        self.df = pd.DataFrame(self._log_line_iter(path))

    def url_ref_stat(self):
        """Count rows per ``reference_url`` (outbound-link clicks).

        :return: DataFrame indexed by ``reference_url`` with one ``count``
            column, sorted by ``count`` in descending order. If the loaded
            data has no ``reference_url`` column (e.g. the log files were
            empty), an empty frame of the same layout is returned.
        """
        if 'reference_url' not in self.df.columns:
            empty = pd.DataFrame({'count': pd.Series(dtype='int64')})
            empty.index.name = 'reference_url'
            return empty

        # Rows whose key is NaN are dropped by groupby, so the group size
        # equals the non-null count of the key column inside each group.
        counts = self.df.groupby('reference_url').size().to_frame('count')
        return counts.sort_values(by='count', ascending=False)
def main(file_pathes=None):
    """Load the access log(s) and print the outbound-link click counts.

    :param file_pathes: list of log file paths to analyse. Defaults to
        ``['www.ttmark.com.access.log']`` when omitted.
    """
    if file_pathes is None:
        file_pathes = ['www.ttmark.com.access.log']

    pd_ng_log_stat = PDNgLogStat()
    pd_ng_log_stat.load_data(file_pathes)

    # Report outbound-link click counts. The call form of print with a
    # single argument behaves the same on Python 2 and Python 3, whereas the
    # statement form is a SyntaxError on Python 3.
    print(pd_ng_log_stat.url_ref_stat())


if __name__ == '__main__':
    main()
|
运行统计和输出结果
1
2
3
4
5
6
7
8
9
10
11
12
13
|
python pd_ng_log_stat.py
                  count
reference_url
-                574546
www.ttmark.com   331136
m.baidu.com       32990
...                 ...
www.google.fr       192
www.google.de       147
www.google.it       136

[231 rows x 1 columns]
|
昵称: HH
QQ: 275258836
ttlsa群交流沟通(QQ群②: 6690706 QQ群③: 168085569 QQ群④: 415230207(新) 微信公众号: ttlsacom)
感觉本文内容不错,读后有收获?
逛逛衣服店,鼓励作者写出更好文章。
收 藏
转载请注明:成长的对话 » 外链点击数-Pandas-Python数据分析(9)