To make a Hive table out of a JSON file you need to write the CREATE TABLE statement based on HiveQL DDL standards specifically for your JSON structure.
It can be very complicated if you are using a nested JSON file so I recommend you to use this quick and easy generator: https://hivetablegenerator.com/
Using HiveQL to analyse JSON files require either org.openx.data.jsonserde.JsonSerDe
or org.apache.hive.hcatalog.data.JsonSerDe
to work correctly.
org.apache.hive.hcatalog.data.JsonSerDe
This is the default JSON SerDe from Apache. This is commonly used to process JSON data like events. These events are represented as blocks of JSON-encoded text separated by a new line. The Hive JSON SerDe does not allow duplicate keys in map or struct key names.
org.openx.data.jsonserde.JsonSerDe
OpenX JSON SerDe is similar to native Apache; however, it offers multiple optional properties such as "ignore.malformed.json", "case.insensitive", and many more. In my opinion, it usually works better when dealing with nested JSON files.
Taking this sample complex JSON file:
{
"schemaVersion": "1.0",
"id": "07c1687a0fd34ebf8a42e8a8627321dc",
"accountId": "123456677",
"partition": "aws",
"region": "us-west-2",
"severity": {
"score": "0",
"description": "Informational"
},
"createdAt": "2021-02-27T18:57:07Z",
"resourcesAffected": {
"s3Bucket": {
"arn": "arn:aws:s3:::bucket-sample",
"name": "bucket-sample",
"createdAt": "2020-08-09T07:24:55Z",
"owner": {
"displayName": "account-name",
"id": "919a30c2f56c0b220c32e9234jnkj435n6jk4nk"
},
"tags": [],
"defaultServerSideEncryption": {
"encryptionType": "AES256"
},
"publicAccess": {
"permissionConfiguration": {
"bucketLevelPermissions": {
"accessControlList": {
"allowsPublicReadAccess": false,
"allowsPublicWriteAccess": false
},
"bucketPolicy": {
"allowsPublicReadAccess": true,
"allowsPublicWriteAccess": false
},
"blockPublicAccess": {
"ignorePublicAcls": false,
"restrictPublicBuckets": false,
"blockPublicAcls": false,
"blockPublicPolicy": false
}
},
"accountLevelPermissions": {
"blockPublicAccess": {
"ignorePublicAcls": false,
"restrictPublicBuckets": false,
"blockPublicAcls": false,
"blockPublicPolicy": false
}
}
},
"effectivePermission": "PUBLIC"
}
},
"s3Object": {
"bucketArn": "arn:aws:s3:::bucket-sample",
"key": "2021/01/17191133/Camping-Checklist-Google-Docs.pdf",
"path": "bucket-sample/2021/01/17191133/Camping-Checklist-Google-Docs.pdf",
"extension": "pdf",
"lastModified": "2021-01-17T22:11:34Z",
"eTag": "e8d990704042d2e1b7bb504fb5868095",
"versionId": "isqHLkSsQUMbbULNT2nMDneMG0zqitbD",
"serverSideEncryption": {
"encryptionType": "AES256"
},
"size": "150532",
"storageClass": "STANDARD",
"tags": [],
"publicAccess": true
}
},
"category": "CLASSIFICATION",
"classificationDetails": {
"jobArn": "arn:aws:macie2:us-west-2:123412341341:classification-job/d6cf41ccc7ea8daf3bd53ddcb86a2da5",
"result": {
"status": {
"code": "COMPLETE"
},
"sizeClassified": "150532",
"mimeType": "application/pdf",
"sensitiveData": []
},
"detailedResultsLocation": "s3://bucket-macie/AWSLogs/123412341341/Macie/us-west-2/d6cf41ccc7ea8daf3bd53ddcb86a2da5/123412341341/50de3137-9806-3e43-9b6e-a6158fdb0e3b.jsonl.gz",
"jobId": "d6cf41ccc7ea8daf3bd53ddcb86a2da5"
}
}
Would require the following create table statement:
CREATE EXTERNAL TABLE IF NOT EXISTS `macie`.`macie_bucket` (
`schemaVersion` STRING,
`id` STRING,
`accountId` STRING,
`partition` STRING,
`region` STRING,
`severity` STRUCT<
`score`:STRING,
`description`:STRING>,
`createdAt` STRING,
`resourcesAffected` STRUCT<
`s3Bucket`:STRUCT<
`arn`:STRING,
`name`:STRING,
`createdAt`:STRING,
`owner`:STRUCT<
`displayName`:STRING,
`id`:STRING>,
`defaultServerSideEncryption`:STRUCT<
`encryptionType`:STRING>,
`publicAccess`:STRUCT<
`permissionConfiguration`:STRUCT<
`bucketLevelPermissions`:STRUCT<
`accessControlList`:STRUCT<
`allowsPublicReadAccess`:BOOLEAN,
`allowsPublicWriteAccess`:BOOLEAN>,
`bucketPolicy`:STRUCT<
`allowsPublicReadAccess`:BOOLEAN,
`allowsPublicWriteAccess`:BOOLEAN>,
`blockPublicAccess`:STRUCT<
`ignorePublicAcls`:BOOLEAN,
`restrictPublicBuckets`:BOOLEAN,
`blockPublicAcls`:BOOLEAN,
`blockPublicPolicy`:BOOLEAN>>,
`accountLevelPermissions`:STRUCT<
`blockPublicAccess`:STRUCT<
`ignorePublicAcls`:BOOLEAN,
`restrictPublicBuckets`:BOOLEAN,
`blockPublicAcls`:BOOLEAN,
`blockPublicPolicy`:BOOLEAN>>>,
`effectivePermission`:STRING>>,
`s3Object`:STRUCT<
`bucketArn`:STRING,
`key`:STRING,
`path`:STRING,
`extension`:STRING,
`lastModified`:STRING,
`eTag`:STRING,
`versionId`:STRING,
`serverSideEncryption`:STRUCT<
`encryptionType`:STRING>,
`size`:STRING,
`storageClass`:STRING,
`publicAccess`:BOOLEAN>>,
`category` STRING,
`classificationDetails` STRUCT<
`jobArn`:STRING,
`result`:STRUCT<
`status`:STRUCT<
`code`:STRING>,
`sizeClassified`:STRING,
`mimeType`:STRING>,
`detailedResultsLocation`:STRING,
`jobId`:STRING>)
ROW FORMAT SERDE
'org.openx.data.jsonserde.JsonSerDe'
LOCATION
's3://awsexamplebucket1-logs/AWSLogs/'
If you need more information from Amazon on how to create table out of nested JSON files for AWS Athena check out this link: https://aws.amazon.com/blogs/big-data/create-tables-in-amazon-athena-from-nested-json-and-mappings-using-jsonserde/