# Apache access-log parser, adapted to also handle DeleGate ("delegated") logs.
# Based on: https://qiita.com/shotakaha/items/05287cd625176945322a

import apache_log_parser
from pprint import pprint
import pandas as pd

# Apache "combined" log format. A matching line looks like:
#   xxx.xxx.xxx.xxx - - [18/Feb/2019:23:58:36 +0900] "GET /ja/index.html HTTP/1.1" 301 240 "-" "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)"
COMBINED_LOG_FORMAT = '%h %l %u %t "%r" %>s %b "%{Referer}i" "%{User-Agent}i"'

# Format used for the DeleGate log file: the same fields without the
# trailing Referer / User-Agent columns.
DELEGATED_LOG_FORMAT = '%h %l %u %t "%r" %>s %b'


def read_apache_log(ifn, logformat=COMBINED_LOG_FORMAT):
    """Parse an access-log file and return the successfully parsed lines.

    Args:
        ifn: Path of the log file to read. The file is decoded as UTF-8 and
            undecodable bytes are dropped (``errors='ignore'``).
        logformat: Apache ``LogFormat`` string handed to
            ``apache_log_parser.make_parser``. Defaults to the combined
            format; pass ``DELEGATED_LOG_FORMAT`` for DeleGate logs.

    Returns:
        A list with one parser result per successfully parsed line, in file
        order.

    Lines that raise ``ValueError`` or
    ``apache_log_parser.LineDoesntMatchException`` are skipped; their counts
    are reported in the summary printed at the end.
    """
    parser = apache_log_parser.make_parser(logformat)
    parsed = []
    value_errors = []  # lines on which the parser raised ValueError
    unmatched = []     # lines that do not match `logformat` at all

    with open(ifn, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            try:
                parsed.append(parser(line))
            except ValueError:
                value_errors.append(line)
            except apache_log_parser.LineDoesntMatchException:
                unmatched.append(line)

    pprint('=== Read Summary ===')
    pprint('Parsed : {0}'.format(len(parsed)))
    pprint('ValueError : {0}'.format(len(value_errors)))
    pprint('Unmatched : {0}'.format(len(unmatched)))
    pprint('====================')

    return parsed


# Input file: '18080.http' is the DeleGate log. For a plain Apache log
# (e.g. 'access_log') drop the `logformat` argument to use the combined format.
ifn = '18080.http'
P = read_apache_log(ifn, logformat=DELEGATED_LOG_FORMAT)
df = pd.DataFrame(P)

print('dataframeの行数・列数の確認==>\n', df.shape)
print('indexの確認==>\n', df.index)
print('columnの確認==>\n', df.columns)
print('dataframeの各列のデータ型を確認==>\n', df.dtypes)

# Extract only the required columns. `.copy()` makes df1 an independent frame
# so that adding the 'byte' column below does not write into a slice of df.
df1 = df[[
    'time_received_datetimeobj',
    'request_url_hostname',
    'status',
    'response_bytes_clf',
]].copy()

# Values that cannot be converted to a number become NaN (errors='coerce')
# instead of raising, as a plain astype(int) would.
df1['byte'] = pd.to_numeric(df1['response_bytes_clf'], errors='coerce')

# Per-hostname total of response bytes.
dh1a = pd.DataFrame(df1.groupby(['request_url_hostname'])['byte'].sum())

# Per-hostname number of requests.
dh2a = pd.DataFrame(df1['request_url_hostname'].value_counts())

# Put request count and byte total side by side, indexed by hostname.
dh2 = pd.concat([dh2a, dh1a], axis=1)
print(dh2)
dh2.to_csv('output.csv')