2 분 소요

Using function to clean json data

from google.cloud import bigquery

location = "asia-northeast3"
my_account = "# rho715"

def runQuery_DF(query):
    final_query = "\n".join([my_account, query]) #add my account info to query input

    client = bigquery.Client(location=location)
    data = client.query(final_query).to_dataframe() #run query
    client.close()

    print('\nquery : ' , final_query)
    return data

query = '''
CREATE TEMP FUNCTION del_json_field(json_str STRING, key_name STRING)
RETURNS STRING
LANGUAGE js
AS r"""
  json_obj = JSON.parse(json_str)
  delete json_obj[key_name]
  return JSON.stringify(json_obj)
""";

select 
  '{"appVer":"5.0.2","carrier":"none","country":"KR","destIp":"125.132.54.23","deviceName":"Box"}' STR,
  del_json_field('{"appVer":"5.0.2","carrier":"none","country":"KR","destIp":"125.132.54.23","deviceName":"Box"}', "destIp")
;
'''
import pandas as pd
pd.set_option('display.max_colwidth', None)
runQuery_DF(query)

STR f0_
0 {"appVer":"5.0.2","carrier":"none","country":"KR","destIp":"125.132.54.23","deviceName":"Box"} {"appVer":"5.0.2","carrier":"none","country":"KR","deviceName":"Box"}

Using json_extract_scalar & json_extract

from google.cloud import bigquery

location = "asia-northeast3"
my_account = "# rho715"

def runQuery_DF(query):
    final_query = "\n".join([my_account, query]) #add my account info to query input

    client = bigquery.Client(location=location)
    data = client.query(final_query).to_dataframe() #run query
    client.close()

    print('\nquery : ' , final_query)
    return data

query = '''
with data_sample as (
  select
  [struct('{"action_item":"sns_share","action_param":{"optional":{"share_type":"facebook"},"required":{"id":"01_0001.1","type":"vod"}},"action_type":"button_click","current":"content_detail"}' as json_data, current_date as yyyy_mm_dd),
  struct('{"action_item":"sns_share","action_param":{"optional":{"share_type":"facebook"},"required":{"id":"01_0001.2","type":"qvod"}},"action_type":"button_click","current":"content_detail"}' as json_data, current_date as yyyy_mm_dd)
  ] as struct_sample
)
select yyyy_mm_dd
    , json_extract_scalar(json_data, '$.action_item') `action_item` --json key
    , json_extract(json_data, '$.action_param') `action_param` --json value
from data_sample, unnest(struct_sample) 
'''
import pandas as pd
pd.set_option('display.max_colwidth', None)
runQuery_DF(query)
yyyy_mm_dd action_item action_param
0 2023-01-28 sns_share {"optional":{"share_type":"facebook"},"required":{"id":"01_0001.1","type":"vod"}}
1 2023-01-28 sns_share {"optional":{"share_type":"facebook"},"required":{"id":"01_0001.2","type":"qvod"}}

Using JSON_QUERY_ARRAY & UNNEST()

from google.cloud import bigquery

location = "asia-northeast3"
my_account = "# rho715"

def runQuery_DF(query):
    final_query = "\n".join([my_account, query]) #add my account info to query input

    client = bigquery.Client(location=location)
    data = client.query(final_query).to_dataframe() #run query
    client.close()

    print('\nquery : ' , final_query)
    return data

query = '''
with data_sample as (
  select
  [struct('{"agent":"SM-N986N","app_version":"none","data":{"action_type":"register","action_value":[{"content_id":"2101","content_type":"live"}]},"device":"android","log_timestamp":"2022-11-11 03:59:59","log_type":"","log_version":"1.2.2","profileid":"0","service":"cholo","uno":"abved","zone":"none"}' as data_json, current_timestamp() as created_at),
  struct('{"agent":"iPhone12,3","app_version":"none","data":{"action_type":"delete_all","action_value":[{"content_id":"0079","content_type":"movie"},{"content_id":"EN394","content_type":"theme"},{"content_id":"EN394","content_type":"theme"}]},"device":"ios","log_timestamp":"2022-11-10 23:58:05","log_type":"","log_version":"1.2.2","profileid":"0","service":"cholo","uno":"zdfoe","zone":"none"}' as data_json, current_timestamp() as created_at),
  struct('{"agent":"iPhone12,3","app_version":"none","data":{"action_type":"delete","action_value":[{"content_id":"0063","content_type":"program"}]},"device":"ios","log_timestamp":"2022-11-10 23:58:05","log_type":"","log_version":"1.2.2","profileid":"0","service":"cholo","uno":"jojojo","zone":"none"}' as data_json, current_timestamp() as created_at)
  ] as struct_sample
)
, struct_to_rows as (
select created_at
    , data_json 
from data_sample, unnest(struct_sample) 
)
, unnest_action_value as (
  SELECT
    JSON_QUERY_ARRAY(data_json, "$.data.action_value") `action_value_array` --array
    , created_at
    , data_json data
  FROM struct_to_rows
  WHERE
  json_extract_scalar(data_json, '$.uno') != 'test'
  and json_extract_scalar(data_json, '$.log_type') != '1.1.1'
)
select
     -- common log
     json_extract_scalar(data, '$.log_type') `log_type`  
     , json_extract_scalar(data, '$.log_timestamp') `log_timestamp`
     , TIMESTAMP_ADD(TIMESTAMP(created_at), INTERVAL 9 HOUR) `ap_timestamp` 
     , json_extract_scalar(data, '$.uno') `uno`
     , json_extract_scalar(data, '$.profileid') `profile_id`
     , json_extract_scalar(data, '$.zone') `zone`
     , json_extract_scalar(data, '$.device') `device_type`
     , json_extract_scalar(data, '$.agent') `agent`
     , json_extract_scalar(data, '$.app_version') `app_version`

     , json_extract_scalar(data, '$.data.action_type') `action_type`
     --action_value
     , json_extract_scalar(action_value, '$.content_type') `content_type`
     , json_extract_scalar(action_value, '$.content_id') `content_id`

from unnest_action_value, unnest(unnest_action_value.action_value_array) as action_value
'''
import pandas as pd
pd.set_option('display.max_colwidth', None)
runQuery_DF(query)
log_type log_timestamp ap_timestamp uno profile_id zone device_type agent app_version action_type content_type content_id
0 2022-11-11 03:59:59 2023-01-28 20:51:57.011544+00:00 abved 0 none android SM-N986N none register live 2101
1 2022-11-10 23:58:05 2023-01-28 20:51:57.011544+00:00 zdfoe 0 none ios iPhone12,3 none delete_all movie 0079
2 2022-11-10 23:58:05 2023-01-28 20:51:57.011544+00:00 zdfoe 0 none ios iPhone12,3 none delete_all theme EN394
3 2022-11-10 23:58:05 2023-01-28 20:51:57.011544+00:00 zdfoe 0 none ios iPhone12,3 none delete_all theme EN394
4 2022-11-10 23:58:05 2023-01-28 20:51:57.011544+00:00 jojojo 0 none ios iPhone12,3 none delete program 0063

댓글남기기