随着 2020 年 BigQuery 脚本引入动态 SQL,这个问题变得容易一些。现在,我们可以动态构建查询并通过EXECUTE IMMEDIATE
.
对于所有数据集都在的大多数情况,类似的事情会做region-us
:
DECLARE dataset_names ARRAY<STRING>;
SET dataset_names = (
SELECT ARRAY_AGG(SCHEMA_NAME) FROM `region-us.INFORMATION_SCHEMA.SCHEMATA`
);
EXECUTE IMMEDIATE (
SELECT STRING_AGG(
(SELECT """
SELECT project_id, dataset_id, table_id, row_count, size_bytes
FROM `""" || s ||
""".__TABLES__`"""),
" UNION ALL ")
FROM UNNEST(dataset_names) AS s);
如果存在大量数据集,则在尝试同时读取所有元数据时可能会返回速率限制错误。
如果发生这种情况,那么我们可以依靠“批处理”方法,这种方法读取起来有点复杂,速度较慢/效率较低,但仍然可以完成工作:
DECLARE dataset_names ARRAY<STRING>;
DECLARE batch ARRAY<STRING>;
DECLARE batch_size INT64 DEFAULT 25;
CREATE TEMP TABLE results (
project_id STRING,
dataset_id STRING,
table_id STRING,
row_count INT64,
size_bytes INT64
);
SET dataset_names = (
SELECT ARRAY_AGG(SCHEMA_NAME)
FROM `region-us.INFORMATION_SCHEMA.SCHEMATA`
);
LOOP
IF ARRAY_LENGTH(dataset_names) < 1 THEN
LEAVE;
END IF;
SET batch = (
SELECT ARRAY_AGG(d)
FROM UNNEST(dataset_names) AS d WITH OFFSET i
WHERE i < batch_size);
EXECUTE IMMEDIATE (
SELECT """INSERT INTO results """
|| STRING_AGG(
(SELECT """
SELECT project_id, dataset_id, table_id, row_count, size_bytes
FROM `""" || s || """.__TABLES__`"""),
" UNION ALL ")
FROM UNNEST(batch) AS s);
SET dataset_names = (
SELECT ARRAY_AGG(d)
FROM UNNEST(dataset_names) AS d
WHERE d NOT IN (SELECT * FROM UNNEST(batch)));
END LOOP;
SELECT * FROM results;