我正在尝试在 Spark (pyspark) 环境中使用 JSON 文件。
问题:无法将 Pyspark Dataframe 中的 JSON 转换为预期格式
第一个输入数据集:
https://health.data.ny.gov/api/views/cnih-y5dw/rows.json
在此文件中,元数据定义在文件开头,位于标签“meta”之下;其后是位于标签“data”之下的实际数据。
仅供参考:将数据从网络下载到本地驱动器所采取的步骤。 1. 我已将文件下载到本地驱动器 2. 然后推送到 hdfs - 从那里我将其读取到 Spark 环境。
df=sqlContext.read.json("/user/train/ny.json",multiLine=True)
df.count()
out[5]: 1
df.show()
df.printSchema()
root
|-- data: array (nullable = true)
| |-- element: array (containsNull = true)
| | |-- element: string (containsNull = true)
|-- meta: struct (nullable = true)
| |-- view: struct (nullable = true)
| | |-- attribution: string (nullable = true)
| | |-- attributionLink: string (nullable = true)
| | |-- averageRating: long (nullable = true)
| | |-- category: string (nullable = true)
| | |-- columns: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- cachedContents: struct (nullable = true)
| | | | | |-- average: string (nullable = true)
| | | | | |-- largest: string (nullable = true)
| | | | | |-- non_null: long (nullable = true)
| | | | | |-- null: long (nullable = true)
| | | | | |-- smallest: string (nullable = true)
| | | | | |-- sum: string (nullable = true)
| | | | | |-- top: array (nullable = true)
| | | | | | |-- element: struct (containsNull = true)
| | | | | | | |-- count: long (nullable = true)
| | | | | | | |-- item: string (nullable = true)
| | | | |-- dataTypeName: string (nullable = true)
| | | | |-- description: string (nullable = true)
| | | | |-- fieldName: string (nullable = true)
| | | | |-- flags: array (nullable = true)
| | | | | |-- element: string (containsNull = true)
| | | | |-- format: struct (nullable = true)
| | | | | |-- align: string (nullable = true)
| | | | | |-- mask: string (nullable = true)
| | | | | |-- noCommas: string (nullable = true)
| | | | | |-- precisionStyle: string (nullable = true)
| | | | |-- id: long (nullable = true)
| | | | |-- name: string (nullable = true)
| | | | |-- position: long (nullable = true)
| | | | |-- renderTypeName: string (nullable = true)
| | | | |-- tableColumnId: long (nullable = true)
| | | | |-- width: long (nullable = true)
| | |-- createdAt: long (nullable = true)
| | |-- description: string (nullable = true)
| | |-- displayType: string (nullable = true)
| | |-- downloadCount: long (nullable = true)
| | |-- flags: array (nullable = true)
| | | |-- element: string (containsNull = true)
| | |-- grants: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- flags: array (nullable = true)
| | | | | |-- element: string (containsNull = true)
| | | | |-- inherited: boolean (nullable = true)
| | | | |-- type: string (nullable = true)
| | |-- hideFromCatalog: boolean (nullable = true)
| | |-- hideFromDataJson: boolean (nullable = true)
| | |-- id: string (nullable = true)
| | |-- indexUpdatedAt: long (nullable = true)
| | |-- metadata: struct (nullable = true)
| | | |-- attachments: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- assetId: string (nullable = true)
| | | | | |-- blobId: string (nullable = true)
| | | | | |-- filename: string (nullable = true)
| | | | | |-- name: string (nullable = true)
| | | |-- availableDisplayTypes: array (nullable = true)
| | | | |-- element: string (containsNull = true)
| | | |-- custom_fields: struct (nullable = true)
| | | | |-- Additional Resources: struct (nullable = true)
| | | | | |-- See Also: string (nullable = true)
| | | | |-- Dataset Information: struct (nullable = true)
| | | | | |-- Agency: string (nullable = true)
| | | | |-- Dataset Summary: struct (nullable = true)
| | | | | |-- Contact Information: string (nullable = true)
| | | | | |-- Coverage: string (nullable = true)
| | | | | |-- Data Frequency: string (nullable = true)
| | | | | |-- Dataset Owner: string (nullable = true)
| | | | | |-- Granularity: string (nullable = true)
| | | | | |-- Organization: string (nullable = true)
| | | | | |-- Posting Frequency: string (nullable = true)
| | | | | |-- Time Period: string (nullable = true)
| | | | | |-- Units: string (nullable = true)
| | | | |-- Disclaimers: struct (nullable = true)
| | | | | |-- Limitations: string (nullable = true)
| | | | |-- Local Data: struct (nullable = true)
| | | | | |-- County Filter: string (nullable = true)
| | | | | |-- County_Column: string (nullable = true)
| | | |-- filterCondition: struct (nullable = true)
| | | | |-- children: array (nullable = true)
| | | | | |-- element: struct (containsNull = true)
| | | | | | |-- metadata: struct (nullable = true)
| | | | | | | |-- includeAuto: long (nullable = true)
| | | | | | | |-- multiSelect: boolean (nullable = true)
| | | | | | | |-- operator: string (nullable = true)
| | | | | | | |-- tableColumnId: struct (nullable = true)
| | | | | | | | |-- 583607: long (nullable = true)
| | | | | | |-- type: string (nullable = true)
| | | | | | |-- value: string (nullable = true)
| | | | |-- metadata: struct (nullable = true)
| | | | | |-- advanced: boolean (nullable = true)
| | | | | |-- unifiedVersion: long (nullable = true)
| | | | |-- type: string (nullable = true)
| | | | |-- value: string (nullable = true)
| | | |-- jsonQuery: struct (nullable = true)
| | | | |-- order: array (nullable = true)
| | | | | |-- element: struct (containsNull = true)
| | | | | | |-- ascending: boolean (nullable = true)
| | | | | | |-- columnFieldName: string (nullable = true)
| | | |-- rdfSubject: string (nullable = true)
| | | |-- renderTypeConfig: struct (nullable = true)
| | | | |-- visible: struct (nullable = true)
| | | | | |-- table: boolean (nullable = true)
| | | |-- rowLabel: string (nullable = true)
| | |-- name: string (nullable = true)
| | |-- newBackend: boolean (nullable = true)
| | |-- numberOfComments: long (nullable = true)
| | |-- oid: long (nullable = true)
| | |-- owner: struct (nullable = true)
| | | |-- displayName: string (nullable = true)
| | | |-- flags: array (nullable = true)
| | | | |-- element: string (containsNull = true)
| | | |-- id: string (nullable = true)
| | | |-- profileImageUrlLarge: string (nullable = true)
| | | |-- profileImageUrlMedium: string (nullable = true)
| | | |-- profileImageUrlSmall: string (nullable = true)
| | | |-- screenName: string (nullable = true)
| | | |-- type: string (nullable = true)
| | |-- provenance: string (nullable = true)
| | |-- publicationAppendEnabled: boolean (nullable = true)
| | |-- publicationDate: long (nullable = true)
| | |-- publicationGroup: long (nullable = true)
| | |-- publicationStage: string (nullable = true)
| | |-- query: struct (nullable = true)
| | | |-- orderBys: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- ascending: boolean (nullable = true)
| | | | | |-- expression: struct (nullable = true)
| | | | | | |-- columnId: long (nullable = true)
| | | | | | |-- type: string (nullable = true)
| | |-- rights: array (nullable = true)
| | | |-- element: string (containsNull = true)
| | |-- rowsUpdatedAt: long (nullable = true)
| | |-- rowsUpdatedBy: string (nullable = true)
| | |-- tableAuthor: struct (nullable = true)
| | | |-- displayName: string (nullable = true)
| | | |-- flags: array (nullable = true)
| | | | |-- element: string (containsNull = true)
| | | |-- id: string (nullable = true)
| | | |-- profileImageUrlLarge: string (nullable = true)
| | | |-- profileImageUrlMedium: string (nullable = true)
| | | |-- profileImageUrlSmall: string (nullable = true)
| | | |-- screenName: string (nullable = true)
| | | |-- type: string (nullable = true)
| | |-- tableId: long (nullable = true)
| | |-- tags: array (nullable = true)
| | | |-- element: string (containsNull = true)
| | |-- totalTimesRated: long (nullable = true)
| | |-- viewCount: long (nullable = true)
| | |-- viewLastModified: long (nullable = true)
| | |-- viewType: string (nullable = true)
问题:所有记录都被放在单独一行、两列之中,即 meta 和 data 两列。另外,使用 Spark 原生的 JSON 读取功能时,Spark 会自动推断架构(元数据) - 我的期望是元数据不应该作为数据帧上的单独列显式出现。
预期输出:JSON 数据集具有以下列列表。它们应该在数据帧中以表格格式显示,以便我可以对其进行查询。
FACILITY, ADDRESS, LAST INSPECTED, VIOLATIONS,TOTAL CRITICAL VIOLATIONS, TOTAL CRIT. NOT CORRECTED, TOTAL NONCRITICAL VIOLATIONS, DESCRIPTION, LOCAL HEALTH DEPARTMENT, COUNTY, FACILITY ADDRESS, CITY, ZIP CODE, NYSDOH GAZETTEER (1980), MUNICIPALITY, OPERATION NAME, PERMIT EXPIRATION DATE, PERMITTED (D/B/A), PERMITTED CORP. NAME,PERM. OPERATOR LAST NAME, PERM. OPERATOR LAST NAME, PERM. OPERATOR FIRST NAME, NYS HEALTH OPERATION ID, INSPECTION TYPE, INSPECTION COMMENTS, FOOD SERVICE FACILITY STATE, Location1
第二个输入数据集:在下面的网站上,这是列出的第一个数据集,即世界银行资助项目数据集
http://jsonstudio.com/resources/
(在该网站上,这是列出的第一个数据集,即世界银行资助项目数据集)
一切正常。
df=sqlContext.read.json("/user/train/wb.json")
df.count()
500
第二个输入数据集可以正常工作,但第一个输入数据集则不能。我的观察是两个 JSON 文件定义元数据的方式不同:在第一个文件中,先定义元数据,然后才是数据;而在第二个文件中,元数据随每一行数据一起提供。
您能否指导我了解第一个输入 JSON 文件格式以及如何在将其转换为 pyspark 数据帧时处理情况?
更新结果:经过初步分析,我们发现文件格式似乎有误,但社区成员提供了另一种读取该格式的方法。我已将该答案标记为正确,并关闭此帖。
如果您需要更多详细信息,请告诉我,提前致谢。