1.1. Pandas分析步骤
- 载入数据
- 按浏览器(browser)分组做 COUNT 统计,得到浏览工具排名。类似如下SQL:
1
2
3
4
5
6
|
-- Count log rows per browser and list the 100 most frequent browsers.
SELECT browser, count(*)
FROM log
GROUP BY browser
-- DESC is required: without it the rows are sorted ascending and the
-- LIMIT would return the 100 least frequent browsers instead of a ranking.
ORDER BY count(*) DESC
LIMIT 0, 100;
|
1.2. 代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
|
cat pd_ng_log_stat.py
#!/usr/bin/env python
#-*- coding: utf-8 -*-
from
ng_line_parser
import
NgLineParser
import
pandas
as
pd
import
socket
import
struct
class PDNgLogStat(object):
    """Load parsed access-log lines into a pandas DataFrame and rank browsers.

    Every log line is fed to an ``NgLineParser`` instance; the dict returned
    by its ``to_dict()`` becomes one row of ``self.df``.
    """

    def __init__(self):
        # One parser instance is reused for every line of every file.
        self.ng_line_parser = NgLineParser()

    def _log_line_iter(self, pathes):
        """Parse every line of the given files and yield one dict per line.

        Args:
            pathes: Iterable of log file paths. A single path passed as a
                plain ``str`` is also accepted and treated as a one-element
                list; without this it would be iterated character by
                character and each character opened as a file.

        Yields:
            The result of ``self.ng_line_parser.to_dict()`` after the parser
            has been fed the current line.
        """
        if isinstance(pathes, str):
            pathes = [pathes]

        for path in pathes:
            with open(path, 'r') as f:
                for line in f:
                    self.ng_line_parser.parse(line)
                    yield self.ng_line_parser.to_dict()

    def load_data(self, path):
        """Build ``self.df`` from the log file(s) at the given path(s).

        Args:
            path: Iterable of file paths (or a single ``str`` path); passed
                unchanged to ``_log_line_iter``.
        """
        self.df = pd.DataFrame(self._log_line_iter(path))

    def browser_stat(self, limit=100):
        """Count requests per browser and return the most frequent ones.

        Args:
            limit: Maximum number of browsers to return. Defaults to 100.

        Returns:
            A DataFrame indexed by ``browser`` with a single ``count``
            column, sorted from the highest count to the lowest.
        """
        # Rows whose browser is missing are dropped by groupby, so the group
        # size equals the number of non-null browser values in each group.
        counts = self.df.groupby('browser').size().to_frame('count')
        return counts.nlargest(limit, 'count')
def main():
    """Load the sample access log and print the per-browser request ranking."""
    file_pathes = ['www.ttmark.com.access.log']

    pd_ng_log_stat = PDNgLogStat()
    pd_ng_log_stat.load_data(file_pathes)

    # Count the number of requests made by each browser.
    # print(...) with a single argument behaves the same on Python 2 and 3,
    # unlike the Python-2-only print statement.
    print(pd_ng_log_stat.browser_stat())


if __name__ == '__main__':
    main()
|
运行统计和输出结果
1
2
3
4
5
6
7
8
9
10
11
12
13
|
python pd_ng_log_stat.py
                                                     count
browser
Googlebot/2.1; +http://www.google.com/bot.html)     104533
Chrome/47.0.2526.106 larbin2.6.3@unspecified.mail   101013
bingbot/2.0; +http://www.bing.com/bingbot.htm)       57072
......
P1 4.1.2)                                              613
Safari/534.30 OppoBrowser/3.9.2                        610
9.3.2; zh_CN)                                          601

[100 rows x 1 columns]
|
昵称: HH
QQ: 275258836
ttlsa群交流沟通(QQ群②: 6690706 QQ群③: 168085569 QQ群④: 415230207(新) 微信公众号: ttlsacom)
感觉本文内容不错,读后有收获?
逛逛衣服店,鼓励作者写出更好文章。
收 藏
转载请注明:成长的对话 » 浏览工具排名-Pandas-Python数据分析(11)