编辑:我在 Python 中看到与 PHP 相同的行为。好像和MySQL有关。
我们正在尝试从 MySQL 5.7 升级到 8.0。我们的代码库使用 PHP MySQLi 来查询 MySQL 服务器。在我们的测试设置中,我们发现绑定大量参数的某些查询的性能较差(慢 50 倍)。我们希望看到 MySQL 8.0 的运行时间与 5.7 相似。下面是示例表结构和故障查询。
-- Lookup table of unique names; referenced by `b`.`a_id`.
-- Repro fixture: schema intentionally kept exactly as in the failing environment.
CREATE TABLE IF NOT EXISTS `a` (
`id` int NOT NULL AUTO_INCREMENT,
`name` varchar(255) NOT NULL,
PRIMARY KEY (`id`) USING BTREE,
UNIQUE KEY `name` (`name`) USING BTREE,
-- NOTE(review): (`id`,`name`) looks redundant — `id` alone is the PK, and InnoDB
-- secondary indexes already carry the PK columns. Kept as-is so the reported
-- query plan stays reproducible; confirm before dropping.
KEY `name_id` (`id`,`name`) USING BTREE
);
-- Child of `a`; each row holds one value per (a_id, value) pair.
-- Repro fixture: schema intentionally kept exactly as in the failing environment.
CREATE TABLE IF NOT EXISTS `b` (
`id` int NOT NULL AUTO_INCREMENT,
`a_id` int NOT NULL,
-- NOTE(review): `utf8` is an alias for utf8mb3, which is deprecated in MySQL 8.0;
-- consider utf8mb4 when migrating (may affect index key lengths shown in EXPLAIN).
`value` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL,
PRIMARY KEY (`id`) USING BTREE,
UNIQUE KEY `uniquevalue` (`a_id`,`value`) USING BTREE,
-- NOTE(review): `a_id` is a leftmost prefix of `uniquevalue` above, so this single-column
-- index looks redundant; kept as-is to reproduce the reported plan.
KEY `a_id` (`a_id`) USING BTREE,
KEY `v` (`value`) USING BTREE,
CONSTRAINT `b_ibfk_1` FOREIGN KEY (`a_id`) REFERENCES `a` (`id`) ON DELETE RESTRICT ON UPDATE RESTRICT
);
-- Junction-style table: products associated with rows of `b`.
-- Repro fixture: schema intentionally kept exactly as in the failing environment.
CREATE TABLE IF NOT EXISTS `c` (
-- NOTE(review): `utf8` = utf8mb3, deprecated in MySQL 8.0 — see note on `b`.`value`.
`product` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL,
`b_id` int NOT NULL,
PRIMARY KEY (`product`,`b_id`) USING BTREE,
KEY `b_id` (`b_id`),
-- NOTE(review): `product` is a leftmost prefix of the PRIMARY KEY, so this index
-- looks redundant; kept as-is to reproduce the reported plan.
KEY `product` (`product`),
CONSTRAINT `c_ibfk_2` FOREIGN KEY (`b_id`) REFERENCES `b` (`id`) ON DELETE RESTRICT ON UPDATE RESTRICT
);
-- Example trouble query (template, NOT runnable as-is: the "?, ?, ?..." list is a
-- placeholder for a dynamic number of bound parameters — ~7,000 in the slow case).
-- Slow (~10 s) on MySQL 8.0.28 as a prepared statement; fast (~100-150 ms) on 5.7
-- or when the IN-list is inlined as string literals.
SELECT c.product, a.name, b.value
FROM b
INNER JOIN a ON b.a_id = a.id AND a.name IN ('1be6f9eb563f3bf85c78b4219bf09de9')
-- this hash is from the dataset (linked below) but it should match a record in the 'a' table that has an associated record in the 'b' table that in turn has an associated record in the 'c' table
INNER JOIN c on c.b_id = b.id and c.product IN (?, ?, ?...) -- ... meaning dynamic number of parameters
如果将查询修改为只返回一条记录(限制1),查询仍然很慢。所以这与返回的数据量无关。如果查询以非参数化方式运行(使用字符串连接),则查询运行时间在所有环境中都是可接受的。添加的参数越多,查询速度就越慢(线性)。对于 7,000 个绑定参数,查询在 MySQL 5.7 中运行时间为 100 - 150 毫秒,在 MySQL 8.0.28 中运行时间约为 10 秒。我们在 PHP 7.4 和 8.0 中看到相同的结果。我们使用 MySQLi 或 PDO 看到相同的结果。
这告诉我这与参数绑定有关。我启用了分析并检查了查询结果。查询的大部分时间 (~95%) 花费在执行步骤,而不是参数绑定步骤。另外,我看到 mysql 8 进程 CPU 在查询运行时被固定。我对这个很困惑。
这里是MySQL 8.0的解释。
| id | select_type | table | partitions | type   | possible_keys            | key     | key_len | ref               | rows | filtered | Extra       |
|----|-------------|-------|------------|--------|--------------------------|---------|---------|-------------------|------|----------|-------------|
| 1  | SIMPLE      | a     |            | const  | PRIMARY,name,name_id     | name    | 1022    | const             | 1    | 100      | Using index |
| 1  | SIMPLE      | c     |            | ref    | PRIMARY,b_id,product     | product | 152     | const             | 1    | 100      | Using index |
| 1  | SIMPLE      | b     |            | eq_ref | PRIMARY,uniquevalue,a_id | PRIMARY | 4       | DefaultWeb.c.b_id | 1    | 5        | Using where |
这里是MySQL 5.7的解释。
| id | select_type | table | partitions | type   | possible_keys            | key     | key_len | ref               | rows | filtered | Extra       |
|----|-------------|-------|------------|--------|--------------------------|---------|---------|-------------------|------|----------|-------------|
| 1  | SIMPLE      | a     |            | const  | PRIMARY,name,name_id     | name    | 257     | const             | 1    | 100      | Using index |
| 1  | SIMPLE      | c     |            | ref    | PRIMARY,b_id,product     | PRIMARY | 152     | const             | 1    | 100      | Using index |
| 1  | SIMPLE      | b     |            | eq_ref | PRIMARY,uniquevalue,a_id | PRIMARY | 4       | DefaultWeb.c.b_id | 1    | 5        | Using where |
这两种解释之间存在一些差异,但同样,这个问题仅发生在 PHP 中的准备好的语句中。
下面是一些演示该问题的 php 代码。编写此代码是为了针对我在下面的 Google Drive 链接中提供的数据集。我还将 MySQL 变量包含在 CSV 中。
<?php
// Repro script for the MySQL 8.0 prepared-statement slowdown described above.
// Modify these to fit your DB connection.
const HOST = '127.0.0.1';
const USER = 'root';
const PASS = 'localtest';
const DB_NAME = 'TestDatabase';
// As the number of parameters increases, time increases linearly.
// We're seeing ~10 seconds with 7000 params with this data.
const NUM_PARAMS = 7000;
/**
 * Build a random alphanumeric string of the given length.
 * Uses rand() (non-cryptographic) — fine here, the values are only test data.
 *
 * @param int $length number of characters to generate
 * @return string
 */
function rand_string($length = 10) {
    $alphabet = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ';
    $maxIndex = strlen($alphabet) - 1;
    $result = '';
    while ($length-- > 0) {
        $result .= $alphabet[rand(0, $maxIndex)];
    }
    return $result;
}
/**
 * Build a placeholder list like "(?,?,?)" — or several comma-joined groups
 * like "(?,?),(?,?)" when $sets > 1 — for use in IN (...) / VALUES clauses.
 *
 * @param int $count placeholders per group
 * @param int $sets  number of groups
 * @return string
 */
function sql_question_marks($count, $sets = 1) {
    $group = '(' . implode(',', array_fill(0, $count, '?')) . ')';
    return implode(',', array_fill(0, $sets, $group));
}
/**
 * Inline values into a quoted SQL list: ['a','b'] -> "('a','b')".
 *
 * WARNING: no escaping is performed — deliberately unsafe, used only to compare
 * against the parameterized path with trusted alphanumeric test data. Never use
 * with untrusted input (SQL injection).
 *
 * @param string[] $params
 * @return string
 */
function unsecure_concat($params) {
    return sprintf("('%s')", implode("','", $params));
}
// Build NUM_PARAMS random values and the matching "sss..." mysqli type string.
$params = [];
$param_types = '';
for ($i = 0; $i < NUM_PARAMS; $i++) {
    $params[] = rand_string();
    $param_types .= 's';
}

/**
 * Execute a prepared statement and return its wall-clock execution time
 * in seconds. Times ONLY execute() — prepare/bind are outside the window,
 * which is what isolates the server-side execution cost being reported.
 *
 * @param mysqli_stmt $stmt an already-prepared (and, if needed, bound) statement
 * @return float elapsed seconds
 */
function timed_execute($stmt) {
    $start_time = hrtime(true);
    $stmt->execute();
    $end_time = hrtime(true);
    return ($end_time - $start_time) / 1000000000; // convert ns to seconds
}

$big_query = <<<SQL
SELECT c.product, a.name, b.value
FROM b
INNER JOIN a ON b.a_id = a.id AND a.name IN ('1be6f9eb563f3bf85c78b4219bf09de9')
INNER JOIN c on c.b_id = b.id and c.product IN
SQL . sql_question_marks(count($params));

$non_parameterized = <<<SQL
SELECT c.product, a.name, b.value
FROM b
INNER JOIN a ON b.a_id = a.id AND a.name IN ('1be6f9eb563f3bf85c78b4219bf09de9')
INNER JOIN c on c.b_id = b.id and c.product IN
SQL . unsecure_concat($params);

$connection = new mysqli(HOST, USER, PASS, DB_NAME);

// Case 1: parameterized. Shows the issue — 100-250 ms in MySQL 5.7, ~10 s in 8.0.
$q = $connection->prepare($big_query);
$q->bind_param($param_types, ...$params);
$total_time = timed_execute($q);
echo 'The total time for parameterized query is ' . $total_time . ' seconds.';
$q->get_result(); // drain the result set; contents unused

// Case 2: parameterized + LIMIT 1. Also slow — rules out result-size effects.
$q = $connection->prepare($big_query . ' LIMIT 1');
$q->bind_param($param_types, ...$params);
$total_time = timed_execute($q);
echo '<br>The total time for parameterized query with limit 1 is ' . $total_time . ' seconds.';
$q->get_result(); // drain the result set; contents unused

// Case 3: values inlined as literals. Same execution time in 5.7 and 8.0.
$q = $connection->prepare($non_parameterized);
$total_time = timed_execute($q);
echo '<br>The total time for non-parameterized query is ' . $total_time . ' seconds.';
您可以在此处下载示例数据:https://drive.google.com/file/d/111T7g1NowfWO_uZ2AhT9jdj4LiSNck8u/view?usp=sharing
编辑:这是带有 7,000 个绑定参数的 JSON 解释。
{
"EXPLAIN": {
"query_block": {
"select_id": 1,
"cost_info": {
"query_cost": "456.60"
},
"nested_loop": [
{
"table": {
"table_name": "a",
"access_type": "const",
"possible_keys": [
"PRIMARY",
"name",
"name_id"
],
"key": "name",
"used_key_parts": [
"name"
],
"key_length": "257",
"ref": [
"const"
],
"rows_examined_per_scan": 1,
"rows_produced_per_join": 1,
"filtered": "100.00",
"using_index": true,
"cost_info": {
"read_cost": "0.00",
"eval_cost": "0.10",
"prefix_cost": "0.00",
"data_read_per_join": "264"
},
"used_columns": [
"id",
"name"
]
}
},
{
"table": {
"table_name": "b",
"access_type": "ref",
"possible_keys": [
"PRIMARY",
"uniquevalue",
"a_id"
],
"key": "uniquevalue",
"used_key_parts": [
"a_id"
],
"key_length": "4",
"ref": [
"const"
],
"rows_examined_per_scan": 87,
"rows_produced_per_join": 87,
"filtered": "100.00",
"using_index": true,
"cost_info": {
"read_cost": "8.44",
"eval_cost": "8.70",
"prefix_cost": "17.14",
"data_read_per_join": "65K"
},
"used_columns": [
"id",
"a_id",
"value"
]
}
},
{
"table": {
"table_name": "c",
"access_type": "ref",
"possible_keys": [
"PRIMARY",
"b_id",
"product"
],
"key": "b_id",
"used_key_parts": [
"b_id"
],
"key_length": "4",
"ref": [
"TestDatabase.b.id"
],
"rows_examined_per_scan": 35,
"rows_produced_per_join": 564,
"filtered": "18.28",
"using_index": true,
"cost_info": {
"read_cost": "130.53",
"eval_cost": "56.47",
"prefix_cost": "456.60",
"data_read_per_join": "88K"
},
"used_columns": [
"product",
"b_id"
],
"attached_condition": "" // i've omitted the condition since it breaks the SO char limit, it contains 7,000 random character strings at 10 length each
}
}
]
}
}
}