Athena使ってみたコマンド残し

Athena

テーブル作成

CREATE EXTERNAL TABLE IF NOT EXISTS elblogdb.dev1_web (
  request_timestamp string, 
  elb_name string, 
  request_ip string, 
  request_port int, 
  backend_ip string, 
  backend_port int, 
  request_processing_time double, 
  backend_processing_time double, 
  client_response_time double, 
  elb_response_code string, 
  backend_response_code string, 
  received_bytes bigint, 
  sent_bytes bigint, 
  request_verb string, 
  url string, 
  protocol string, 
  user_agent string, 
  ssl_cipher string, 
  ssl_protocol string ) 
PARTITIONED BY(year string, month string, day string)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
 
WITH SERDEPROPERTIES (
         'serialization.format' = '1','input.regex' = '([^ ]*) ([^ ]*) ([^ ]*):([0-9]*) ([^ ]*):([0-9]*) ([.0-9]*) ([.0-9]*) ([.0-9]*) (-|[0-9]*) (-|[0-9]*) ([-0-9]*) ([-0-9]*) \\\"([^ ]*) ([^ ]*) (- |[^ ]*)\\\" (\"[^\"]*\") ([A-Z0-9-]+) ([A-Za-z0-9.-]*)$' ) 
LOCATION 's3://XXXX-dev-elblogs/dev1/web/AWSLogs/XXXXXXX/elasticloadbalancing/ap-northeast-1/';

パーティション追加

ALTER TABLE elblogdb.dev1_web ADD PARTITION (year='2018',month='05',day='14') location 's3://XXXX-dev-elblogs/dev1/web/AWSLogs/XXXXXX/elasticloadbalancing/ap-northeast-1/2018/05/14/'

パーティション確認

show partitions elblogdb.dev1_web;

パーティションロード

MSCK REPAIR TABLE elblogdb.dev1_web

検索

SELECT "dev1_web"."backend_ip" AS "backend_ip",
  "dev1_web"."backend_port" AS "backend_port",
  "dev1_web"."backend_processing_time" AS "backend_processing",
  "dev1_web"."backend_response_code" AS "backend_response_c",
  "dev1_web"."client_response_time" AS "client_response_ti",
  "dev1_web"."day" AS "day",
  "dev1_web"."elb_name" AS "elb_name",
  "dev1_web"."elb_response_code" AS "elb_response_code",
  "dev1_web"."month" AS "month",
  "dev1_web"."protocol" AS "protocol",
  "dev1_web"."received_bytes" AS "received_bytes",
  "dev1_web"."request_ip" AS "request_ip",
  "dev1_web"."request_port" AS "request_port",
  "dev1_web"."request_processing_time" AS "request_processing",
  "dev1_web"."request_timestamp" AS "request_timestamp",
  "dev1_web"."request_verb" AS "request_verb",
  "dev1_web"."sent_bytes" AS "sent_bytes",
  "dev1_web"."ssl_cipher" AS "ssl_cipher",
  "dev1_web"."ssl_protocol" AS "ssl_protocol",
  "dev1_web"."url" AS "url",
  "dev1_web"."user_agent" AS "user_agent",
  "dev1_web"."year" AS "year"
FROM "elblogdb"."dev1_web" "dev1_web"
WHERE REGEXP_LIKE(LOWER("dev1_web"."url"), CONCAT('.*', '7241996369447818596', '.*'))
LIMIT 20

作業ログ

import boto3
import time
 
client = boto3.client('athena')
 
def lambda_handler(event, context):
    for record in event["Records"]:
        bucket = record['s3']['bucket']['name']
        table_name = "elblogdb.dev1_web"
        key = record['s3']['object']['key']
        key_tmp = key.split('/')
        year = key_tmp[len(key_tmp)-4]
        month = key_tmp[len(key_tmp)-3]
        day = key_tmp[len(key_tmp)-2]
        path = 'dev1/web/AWSLogs/XXXXXX/elasticloadbalancing/ap-northeast-1' + '/' + year + '/' + month + '/' + day + '/'
        sql = "ALTER TABLE {} ADD PARTITION (year=\'{}\',month=\'{}\',day=\'{}\') location \'{}\'".format \
            (table_name, year, month , day,'s3://' + bucket + '/' + path)
        print('sql=' + sql)
        client.start_query_execution(
            QueryString=sql,
            QueryExecutionContext={
                'Database': 'elblogdb'
            },
            ResultConfiguration={
                'OutputLocation': 's3://aws-athena-query-results-XXXXXX-ap-northeast-1'
            }
        )