- 追加された行はこのように表示されます。
- 削除された行はこのように表示されます。
! apache log parser
https://qiita.com/shotakaha/items/05287cd625176945322a
プログラム(delegated 対応)
import apache_log_parser
from pprint import pprint
import pandas as pd
def read_apache_log(ifn, logformat='%h %l %u %t "%r" %>s %b'):
    """Parse an access log file line by line and return the parsed records.

    Args:
        ifn: Path of the log file to read.
        logformat: Apache ``LogFormat`` string handed to
            ``apache_log_parser.make_parser``. The default is the format
            used for delegated logs (no Referer / User-Agent fields), which
            is what this function previously hard-coded regardless of the
            argument. For an Apache "combined" log pass
            ``'%h %l %u %t "%r" %>s %b "%{Referer}i" "%{User-Agent}i"'``.

    Returns:
        A list with one parsed record per successfully parsed line, in file
        order (each record is whatever the parser returns for a line).

    Lines that fail to parse are not returned; they are only counted in the
    summary printed at the end.
    """
    parser = apache_log_parser.make_parser(logformat)

    parsed = []
    value_errors = []
    unmatched = []

    # errors='ignore' drops undecodable bytes so that one badly encoded
    # request line does not abort the whole run.
    with open(ifn, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            try:
                parsed.append(parser(line))
            except ValueError:
                value_errors.append(line)
            except apache_log_parser.LineDoesntMatchException:
                # The line does not fit `logformat`; count it so the summary
                # shows how much of the file was skipped.
                unmatched.append(line)

    pprint('=== Read Summary ===')
    pprint('Parsed : {0}'.format(len(parsed)))
    pprint('ValueError : {0}'.format(len(value_errors)))
    pprint('Unmatched : {0}'.format(len(unmatched)))
    pprint('====================')
    return parsed
# Per-host traffic summary: parse the log, then write the request count and
# the total response bytes for each requested hostname to output.csv.

# ifn = 'access_log'  # Apache log file
ifn = '18080.http'  # delegated log file
P = read_apache_log(ifn)
df = pd.DataFrame(P)

print('dataframeの行数・列数の確認==>\n', df.shape)
print('indexの確認==>\n', df.index)
print('columnの確認==>\n', df.columns)
print('dataframeの各列のデータ型を確認==>\n', df.dtypes)

# Keep only the columns needed for the summary. The explicit copy makes df1
# an independent frame, so adding the 'byte' column below does not write
# into a slice of df (which triggers pandas' SettingWithCopyWarning).
df1 = df[['time_received_datetimeobj', 'request_url_hostname',
          'status', 'response_bytes_clf']].copy()

# The %b field is logged as '-' when no body bytes were sent, so a plain
# astype(int) raises ValueError on such rows. to_numeric with
# errors='coerce' turns those into NaN, which sum() then skips.
df1['byte'] = pd.to_numeric(df1['response_bytes_clf'], errors='coerce')

# Total bytes per hostname.
dh1a = pd.DataFrame(df1.groupby(['request_url_hostname'])['byte'].sum())
# Number of requests per hostname.
dh2a = pd.DataFrame(df1['request_url_hostname'].value_counts())

# Join both summaries side by side on the hostname index.
dh2 = pd.concat([dh2a, dh1a], axis=1)
print(dh2)
dh2.to_csv('output.csv')