トップ 差分 一覧 ソース 検索 ヘルプ RSS ログイン

PRG-PY-apache_log_parser

apache log parser

https://qiita.com/shotakaha/items/05287cd625176945322a

プログラム(delegated 対応)

import apache_log_parser
from pprint import pprint
import pandas as pd

def read_apache_log(ifn, logformat='%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'):
    """Parse an access log file line by line with apache_log_parser.

    Prints a short summary (parsed / rejected line counts) when done.

    Args:
        ifn: Path of the log file to read.
        logformat: Apache LogFormat string.
            NOTE(review): this argument is currently overwritten below with
            the delegated format, so a caller-supplied value has no effect.
            Kept that way because the existing call passes no format and
            depends on the delegated one -- confirm before honoring it.

    Returns:
        A list with one parsed record per successfully parsed line,
        in file order.
    """
    # Format of the delegated log: same as the default above, minus the
    # Referer / User-Agent fields.
    logformat = "%h %l %u %t \"%r\" %>s %b"

    parser = apache_log_parser.make_parser(logformat)
    parsed = []
    value_errors = 0
    unmatched = 0
    # errors='ignore' drops bytes that are not valid UTF-8 instead of
    # aborting the whole read on one bad line.
    with open(ifn, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            try:
                parsed.append(parser(line))
            except ValueError:
                value_errors += 1
            except apache_log_parser.LineDoesntMatchException:
                # Keep the per-line marker, but also count the line so the
                # summary accounts for every rejected line.
                print("E")
                unmatched += 1

    pprint('=== Read Summary ===')
    pprint('Parsed     : {0}'.format(len(parsed)))
    pprint('ValueError : {0}'.format(value_errors))
    pprint('Unmatched  : {0}'.format(unmatched))
    pprint('====================')

    return parsed

# ifn = 'access_log'  ## Apache log file
ifn = '18080.http'  ## delegated log file
P = read_apache_log(ifn)
df = pd.DataFrame(P)
print('dataframeの行数・列数の確認==>\n', df.shape)
print('indexの確認==>\n', df.index)
print('columnの確認==>\n', df.columns)
print('dataframeの各列のデータ型を確認==>\n', df.dtypes)

# Keep only the columns needed for the per-host summary.
# .copy() makes df1 an independent frame, so adding the 'byte' column below
# does not trigger pandas' SettingWithCopyWarning.
df1 = df[['time_received_datetimeobj', 'request_url_hostname', 'status', 'response_bytes_clf']].copy()
# %b logs '-' instead of 0 when no bytes were sent, which astype(int) cannot
# convert; to_numeric(errors='coerce') turns those into NaN, which sum() skips.
df1['byte'] = pd.to_numeric(df1['response_bytes_clf'], errors='coerce')
# Total bytes per host.
dh1a = pd.DataFrame(df1.groupby(['request_url_hostname'])['byte'].sum())
# Number of requests per host.
dh2a = pd.DataFrame(df1['request_url_hostname'].value_counts())
# Join both per-host tables side by side on the host index.
dh2 = pd.concat([dh2a, dh1a], axis=1)
print(dh2)
dh2.to_csv('output.csv')