1.1. Pandas分析步骤
- 载入日志数据
- 载入area_ip数据
- 将 CDN_IP请求数 进行 COUNT。类似如下SQL:
1
2
3
4
5
6
7
8
9
10
|
SELECT
inet_aton
(
l
.
cdn_ip
)
,
count
(
*
)
,
a
.
addr
FROM
log
AS
l
INNER
JOIN
area_ip
AS
a
ON
a
.
start_ip_num
<=
inet_aton
(
l
.
cdn_ip
)
AND
a
.
end_ip_num
>=
inet_aton
(
l
.
cdn_ip
)
GROUP
BY
cdn_ip
ORDER
BY
count
(
*
)
LIMIT
0
,
100
;
|
1.2. 代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
|
cat
pd_ng_log_stat
.
py
#!/usr/bin/env python
#-*- coding: utf-8 -*-
from
ng_line_parser
import
NgLineParser
import
pandas
as
pd
import
socket
import
struct
class
PDNgLogStat
(
object
)
:
def
__init__
(
self
)
:
self
.
ng_line_parser
=
NgLineParser
(
)
def
_log_line_iter
(
self
,
pathes
)
:
""
"解析文件中的每一行并生成一个迭代器"
""
for
path
in
pathes
:
with
open
(
path
,
'r'
)
as
f
:
for
index
,
line
in
enumerate
(
f
)
:
self
.
ng_line_parser
.
parse
(
line
)
yield
self
.
ng_line_parser
.
to_dict
(
)
def
_ip2num
(
self
,
ip
)
:
""
"用于IP转化为数字"
""
ip_num
=
-
1
try
:
# 将IP转化成INT/LONG 数字
ip_num
=
socket
.
ntohl
(
struct
.
unpack
(
"I"
,
socket
.
inet_aton
(
str
(
ip
)
)
)
[
0
]
)
except
:
pass
finally
:
return
ip_num
def
_get_addr_by_ip
(
self
,
ip
)
:
""
"通过给的IP获得地址"
""
ip_num
=
self
.
_ip2num
(
ip
)
try
:
addr_df
=
self
.
ip_addr_df
[
(
self
.
ip_addr_df
.
ip_start_num
<=
ip_num
)
&
(
ip_num
<=
self
.
ip_addr_df
.
ip_end_num
)
]
addr
=
addr_df
.
at
[
addr_df
.
index
.
tolist
(
)
[
0
]
,
'addr'
]
return
addr
except
:
return
None
def
load_data
(
self
,
path
)
:
""
"通过给的文件路径加载数据生成 DataFrame"
""
self
.
df
=
pd
.
DataFrame
(
self
.
_log_line_iter
(
path
)
)
def
uv_cdn_ip
(
self
,
top
=
100
)
:
""
"统计cdn ip量"
""
group_by_cols
=
[
'cdn_ip'
]
# 需要分组的列,只计算和显示该列
# 直接统计次数
url_req_grp
=
self
.
df
[
group_by_cols
]
.
groupby
(
self
.
df
[
'cdn_ip'
]
)
return
url_req_grp
.
agg
(
[
'count'
]
)
[
'cdn_ip'
]
.
nlargest
(
top
,
'count'
)
def
uv_cdn_ip_addr
(
self
,
top
=
100
)
:
""
"统计cdn ip量 地址"
""
cnt_df
=
self
.
uv_cdn_ip
(
top
)
# 添加 ip 地址 列
cnt_df
.
insert
(
len
(
cnt_df
.
columns
)
,
'addr'
,
cnt_df
.
index
.
map
(
self
.
_get_addr_by_ip
)
)
def
load_ip_addr
(
self
,
path
)
:
""
"加载IP"
""
cols
=
[
'id'
,
'ip_start_num'
,
'ip_end_num'
,
'ip_start'
,
'ip_end'
,
'addr'
,
'operator'
]
self
.
ip_addr_df
=
pd
.
read_csv
(
path
,
sep
=
'\t'
,
names
=
cols
,
index_col
=
'id'
)
return
self
.
ip_addr_df
def
main
(
)
:
file_pathes
=
[
'www.ttmark.com.access.log'
]
pd_ng_log_stat
=
PDNgLogStat
(
)
pd_ng_log_stat
.
load_data
(
file_pathes
)
# 加载 ip 地址
area_ip_path
=
'area_ip.csv'
pd_ng_log_stat
.
load_ip_addr
(
area_ip_path
)
# 统计 CDN IP 访问量 和 地址
print
pd_ng_log_stat
.
uv_cdn_ip_addr
(
)
if
__name__
==
'__main__'
:
main
(
)
|
运行统计和输出结果
1
2
3
4
5
6
7
8
9
10
11
12
13
|
python
pd_ng_log_stat
.
py
count
addr
cdn
_ip
140.205.127.2
31943
浙江省杭州市
101.200.101.203
26306
None
101.200.101.214
24667
None
.
.
.
.
.
.
140.205.253.155
4065
浙江省杭州市
140.205.253.174
4048
浙江省杭州市
140.205.253.131
3972
浙江省杭州市
[
100
rows
x
2
columns
]
|
昵称: HH
QQ: 275258836
ttlsa群交流沟通(QQ群②: 6690706 QQ群③: 168085569 QQ群④: 415230207(新) 微信公众号: ttlsacom)
感觉本文内容不错,读后有收获?
逛逛衣服店,鼓励作者写出更好文章。
收 藏