1.1. Pandas分析步骤
- 载入数据
- 将 请求的URL 进行 COUNT。类似如下SQL:
1
2
3
4
5
6
|
SELECT request_url,
       count(*)
FROM log
GROUP BY request_url
ORDER BY count(*) DESC
LIMIT 0, 100;
|
1.2. 代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
|
cat pd_ng_log_stat.py
#!/usr/bin/env python
#-*- coding: utf-8 -*-
from
ng_line_parser
import
NgLineParser
import
pandas
as
pd
import
socket
import
struct
class PDNgLogStat(object):
    """Load parsed access-log lines into a pandas DataFrame and compute stats.

    Every line of the given log files is fed to an ``NgLineParser``; the dict
    produced for each line becomes one row of ``self.df``.
    """

    # Types treated as "one path" rather than "a collection of paths".
    # ``type(u'')`` is ``unicode`` on Python 2 and plain ``str`` on Python 3.
    _SINGLE_PATH_TYPES = (str, bytes, type(u''))

    def __init__(self):
        self.ng_line_parser = NgLineParser()

    def _log_line_iter(self, pathes):
        """Parse every line of every file and yield the parsed records lazily.

        Args:
            pathes: Iterable of log file paths; files are read in order.

        Yields:
            The value of ``self.ng_line_parser.to_dict()`` after parsing each
            line.
        """
        for path in pathes:
            with open(path, 'r') as f:
                for line in f:
                    self.ng_line_parser.parse(line)
                    yield self.ng_line_parser.to_dict()

    def load_data(self, path):
        """Build ``self.df`` from the log file(s) at the given path(s).

        Args:
            path: Either an iterable of file paths or a single file path.
                A single string is wrapped in a list; iterating it directly
                would try to open each character as a file.
        """
        if isinstance(path, self._SINGLE_PATH_TYPES):
            path = [path]
        self.df = pd.DataFrame(self._log_line_iter(path))

    def url_req_stat(self):
        """Count how many times each URL was requested.

        Returns:
            A DataFrame indexed by ``request_url`` with a single ``count``
            column, sorted by ``count`` in descending order.
        """
        # size() counts the rows of each group; to_frame() names the column
        # 'count' so the result has the same shape as an agg(['count']) call.
        hits = self.df.groupby('request_url').size().to_frame('count')
        return hits.sort_values(by='count', ascending=False)
def main():
    """Load the sample access log and print the per-URL request counts."""
    file_pathes = ['www.ttmark.com.access.log']

    pd_ng_log_stat = PDNgLogStat()
    pd_ng_log_stat.load_data(file_pathes)

    # Page hit counts. print() with a single argument behaves the same on
    # Python 2 and Python 3, whereas the print statement is Python 2 only.
    print(pd_ng_log_stat.url_req_stat())
# Run the analysis only when this file is executed as a script.
if __name__ == '__main__':
    main()
|
运行统计和输出结果
1
2
3
4
5
6
7
8
9
10
11
12
13
|
python pd_ng_log_stat.py
                                count
request_url
/wp-admin/admin-ajax.php       246361
/tag/                          126012
/                               57325
...                               ...
/chufang/2016/06/25/8634.html    2312
/chufang/2015/03/26/4686.html    2293
/jiaju/2014/12/05/1348.html      2230

[29205 rows x 1 columns]
|
昵称: HH
QQ: 275258836
ttlsa群交流沟通(QQ群②: 6690706 QQ群③: 168085569 QQ群④: 415230207(新) 微信公众号: ttlsacom)
感觉本文内容不错,读后有收获?
逛逛衣服店,鼓励作者写出更好文章。
收 藏
转载请注明:成长的对话 » URL请求数-Pandas-Python数据分析(7)