Line data Source code
1 : // Unit tests for the Arrow-flavored chunk -> BigQuery cell converter.
2 : // The tests run a handful of canonical query shapes through a real
3 : // in-memory DuckDB connection (using the libduckdb C API the engine
4 : // itself drives) and assert that each fetched data chunk lowers to
5 : // the canonical `storage::Value` shape the engine streams back to
6 : // the gateway.
7 : //
8 : // We exercise the converter against the chunk API rather than
9 : // against the higher-level `DuckDbExecutor` so the test does not
10 : // pull in GoogleSQL: arrow_to_bq is purely a libduckdb-on-top-of-
11 : // schema helper. The `DuckDbExecutor` integration is covered
12 : // separately by `duckdb_executor_test.cc`,
13 : // `//backend/engine/coordinator:local_coordinator_engine_test`,
14 : // and the `gateway/e2e/query_duckdb_*` integration suite.
15 :
16 : #include "backend/engine/duckdb/arrow_to_bq.h"
17 :
18 : #include <string>
19 : #include <utility>
20 : #include <vector>
21 :
22 : #include "absl/status/status.h"
23 : #include "absl/strings/string_view.h"
24 : #include "backend/schema/schema.h"
25 : #include "backend/storage/storage.h"
26 : #include "duckdb.h"
27 : #include "gtest/gtest.h"
28 :
29 : namespace bigquery_emulator {
30 : namespace backend {
31 : namespace engine {
32 : namespace duckdb {
33 : namespace arrow_to_bq {
34 : namespace {
35 :
36 : // Opens a fresh in-memory DuckDB database + connection for each test.
37 : // The destructor tears them down in the right order (connection
38 : // before database) so the per-test state never leaks.
39 : class ArrowToBqTest : public ::testing::Test {
40 : protected:
41 14 : void SetUp() override {
42 14 : ASSERT_EQ(::duckdb_open(nullptr, &db_), ::DuckDBSuccess);
43 14 : ASSERT_EQ(::duckdb_connect(db_, &conn_), ::DuckDBSuccess);
44 14 : }
45 14 : void TearDown() override {
46 14 : if (conn_ != nullptr) ::duckdb_disconnect(&conn_);
47 14 : if (db_ != nullptr) ::duckdb_close(&db_);
48 14 : }
49 :
50 : // Runs `sql` and returns a vector of fetched rows, each rendered
51 : // against `output_schema`. The helper exhausts every chunk via
52 : // duckdb_fetch_chunk so the per-test assertions can read the full
53 : // result without juggling chunk lifetime.
54 : std::vector<storage::Row> RunAndFetch(absl::string_view sql,
55 13 : const schema::TableSchema& schema) {
56 13 : ::duckdb_result result;
57 13 : const std::string sql_str(sql);
58 26 : EXPECT_EQ(::duckdb_query(conn_, sql_str.c_str(), &result), ::DuckDBSuccess)
59 26 : << ::duckdb_result_error(&result);
60 13 : std::vector<storage::Row> rows;
61 26 : while (true) {
62 26 : ::duckdb_data_chunk chunk = ::duckdb_fetch_chunk(result);
63 26 : if (chunk == nullptr) break;
64 13 : const ::idx_t size = ::duckdb_data_chunk_get_size(chunk);
65 31 : for (::idx_t r = 0; r < size; ++r) {
66 18 : auto row = ChunkRowToCells(chunk, r, schema);
67 36 : EXPECT_TRUE(row.ok()) << row.status();
68 18 : if (row.ok()) rows.push_back(std::move(row).value());
69 18 : }
70 13 : ::duckdb_destroy_data_chunk(&chunk);
71 13 : }
72 13 : ::duckdb_destroy_result(&result);
73 13 : return rows;
74 13 : }
75 :
76 : ::duckdb_database db_ = nullptr;
77 : ::duckdb_connection conn_ = nullptr;
78 : };
79 :
80 : // SELECT 1 against a single-column INT64 schema is the simplest
81 : // shape: an INT64 vector with one element, no validity mask, and
82 : // the rendered cell carries the analyzer's int64 value untouched.
83 1 : TEST_F(ArrowToBqTest, RendersBigintAsInt64Cell) {
84 1 : schema::TableSchema s;
85 1 : schema::ColumnSchema col;
86 1 : col.name = "n";
87 1 : col.type = schema::ColumnType::kInt64;
88 1 : s.columns.push_back(col);
89 :
90 1 : std::vector<storage::Row> rows =
91 1 : RunAndFetch("SELECT CAST(42 AS BIGINT) AS n", s);
92 1 : ASSERT_EQ(rows.size(), 1u);
93 1 : ASSERT_EQ(rows[0].cells.size(), 1u);
94 1 : EXPECT_EQ(rows[0].cells[0].kind(), storage::Value::Kind::kInt64);
95 1 : EXPECT_EQ(rows[0].cells[0].int64_value(), 42);
96 1 : }
97 :
98 : // VARCHAR cells round-trip through duckdb_string_t's inline-or-pointer
99 : // storage. We exercise both: "ada" fits in the 12-byte inline slot,
100 : // and "abracadabra-extra-long-string" lives behind the pointer leg.
101 1 : TEST_F(ArrowToBqTest, RendersVarcharBothInlineAndPointer) {
102 1 : schema::TableSchema s;
103 1 : schema::ColumnSchema col;
104 1 : col.name = "name";
105 1 : col.type = schema::ColumnType::kString;
106 1 : s.columns.push_back(col);
107 :
108 1 : std::vector<storage::Row> rows = RunAndFetch(
109 1 : "SELECT 'ada' UNION ALL SELECT 'abracadabra-extra-long-string' "
110 1 : "ORDER BY 1",
111 1 : s);
112 1 : ASSERT_EQ(rows.size(), 2u);
113 1 : EXPECT_EQ(rows[0].cells[0].kind(), storage::Value::Kind::kString);
114 1 : EXPECT_EQ(rows[0].cells[0].string_value(), "abracadabra-extra-long-string");
115 1 : EXPECT_EQ(rows[1].cells[0].kind(), storage::Value::Kind::kString);
116 1 : EXPECT_EQ(rows[1].cells[0].string_value(), "ada");
117 1 : }
118 :
119 : // NULL cells must be detected via the validity mask, not via a raw
120 : // data-pointer read. The fixture column is NULLABLE so the analyzer
121 : // allocates a mask we can exercise.
122 1 : TEST_F(ArrowToBqTest, NullValidityMaskProducesNullCell) {
123 1 : schema::TableSchema s;
124 1 : schema::ColumnSchema col;
125 1 : col.name = "v";
126 1 : col.type = schema::ColumnType::kInt64;
127 1 : s.columns.push_back(col);
128 :
129 1 : std::vector<storage::Row> rows = RunAndFetch(
130 1 : "SELECT CAST(NULL AS BIGINT) UNION ALL SELECT CAST(7 AS BIGINT) "
131 1 : "ORDER BY 1 NULLS FIRST",
132 1 : s);
133 1 : ASSERT_EQ(rows.size(), 2u);
134 1 : EXPECT_TRUE(rows[0].cells[0].is_null());
135 1 : EXPECT_EQ(rows[1].cells[0].kind(), storage::Value::Kind::kInt64);
136 1 : EXPECT_EQ(rows[1].cells[0].int64_value(), 7);
137 1 : }
138 :
139 : // BOOL / FLOAT64 / DATE / TIMESTAMP cover the per-type renderings
140 : // every BigQuery cell shape relies on. We pin the exact string
141 : // formats for DATE / TIMESTAMP because those are what the gateway
142 : // passes through verbatim to the REST wire envelope.
143 1 : TEST_F(ArrowToBqTest, RendersScalarTypeFamily) {
144 1 : schema::TableSchema s;
145 1 : schema::ColumnSchema flag;
146 1 : flag.name = "flag";
147 1 : flag.type = schema::ColumnType::kBool;
148 1 : schema::ColumnSchema score;
149 1 : score.name = "score";
150 1 : score.type = schema::ColumnType::kFloat64;
151 1 : schema::ColumnSchema dt;
152 1 : dt.name = "dt";
153 1 : dt.type = schema::ColumnType::kDate;
154 1 : schema::ColumnSchema ts;
155 1 : ts.name = "ts";
156 1 : ts.type = schema::ColumnType::kTimestamp;
157 1 : s.columns = {flag, score, dt, ts};
158 :
159 1 : std::vector<storage::Row> rows = RunAndFetch(
160 1 : "SELECT TRUE AS flag, CAST(1.5 AS DOUBLE) AS score, "
161 1 : "DATE '2024-03-09' AS dt, TIMESTAMP '2024-03-09 12:34:56' AS ts",
162 1 : s);
163 1 : ASSERT_EQ(rows.size(), 1u);
164 1 : ASSERT_EQ(rows[0].cells.size(), 4u);
165 1 : EXPECT_EQ(rows[0].cells[0].kind(), storage::Value::Kind::kBool);
166 1 : EXPECT_TRUE(rows[0].cells[0].bool_value());
167 1 : EXPECT_EQ(rows[0].cells[1].kind(), storage::Value::Kind::kFloat64);
168 1 : EXPECT_DOUBLE_EQ(rows[0].cells[1].float64_value(), 1.5);
169 1 : EXPECT_EQ(rows[0].cells[2].kind(), storage::Value::Kind::kString);
170 1 : EXPECT_EQ(rows[0].cells[2].string_value(), "2024-03-09");
171 1 : EXPECT_EQ(rows[0].cells[3].kind(), storage::Value::Kind::kString);
172 1 : EXPECT_EQ(rows[0].cells[3].string_value(), "2024-03-09 12:34:56.000000");
173 1 : }
174 :
175 1 : TEST_F(ArrowToBqTest, RendersTimestampTzShortOffsetAsUtcTimestamp) {
176 1 : schema::TableSchema s;
177 1 : schema::ColumnSchema ts;
178 1 : ts.name = "ts";
179 1 : ts.type = schema::ColumnType::kTimestamp;
180 1 : s.columns.push_back(ts);
181 :
182 1 : std::vector<storage::Row> rows =
183 1 : RunAndFetch("SELECT TIMESTAMPTZ '2025-12-01 10:49:40+00' AS ts", s);
184 1 : ASSERT_EQ(rows.size(), 1u);
185 1 : ASSERT_EQ(rows[0].cells.size(), 1u);
186 1 : EXPECT_EQ(rows[0].cells[0].kind(), storage::Value::Kind::kString);
187 1 : EXPECT_EQ(rows[0].cells[0].string_value(), "2025-12-01 10:49:40.000000");
188 1 : }
189 :
190 : // LIST vectors lower into `Value::Array`, with each element rendered
191 : // against the inner column schema. The fixture mixes a non-null and
192 : // an empty array so the offset / length math gets exercised twice.
193 1 : TEST_F(ArrowToBqTest, RendersListAsRepeatedArrayCell) {
194 1 : schema::TableSchema s;
195 1 : schema::ColumnSchema col;
196 1 : col.name = "tags";
197 1 : col.type = schema::ColumnType::kString;
198 1 : col.mode = schema::ColumnMode::kRepeated;
199 1 : s.columns.push_back(col);
200 :
201 1 : std::vector<storage::Row> rows = RunAndFetch(
202 1 : "SELECT * FROM (VALUES (CAST([] AS VARCHAR[])), "
203 1 : "(['x', 'y'])) AS t(tags) ORDER BY array_length(tags)",
204 1 : s);
205 1 : ASSERT_EQ(rows.size(), 2u);
206 1 : EXPECT_EQ(rows[0].cells[0].kind(), storage::Value::Kind::kArray);
207 1 : EXPECT_TRUE(rows[0].cells[0].array_value().empty());
208 1 : EXPECT_EQ(rows[1].cells[0].kind(), storage::Value::Kind::kArray);
209 1 : ASSERT_EQ(rows[1].cells[0].array_value().size(), 2u);
210 1 : EXPECT_EQ(rows[1].cells[0].array_value()[0].string_value(), "x");
211 1 : EXPECT_EQ(rows[1].cells[0].array_value()[1].string_value(), "y");
212 1 : }
213 :
214 : // STRUCT vectors lower into `Value::Struct` with positional fields in
215 : // the same order the analyzer's `ColumnSchema::fields` lists them.
216 1 : TEST_F(ArrowToBqTest, RendersStructAsStructCell) {
217 1 : schema::TableSchema s;
218 1 : schema::ColumnSchema col;
219 1 : col.name = "rec";
220 1 : col.type = schema::ColumnType::kStruct;
221 1 : schema::ColumnSchema f_id;
222 1 : f_id.name = "id";
223 1 : f_id.type = schema::ColumnType::kInt64;
224 1 : schema::ColumnSchema f_name;
225 1 : f_name.name = "name";
226 1 : f_name.type = schema::ColumnType::kString;
227 1 : col.fields = {f_id, f_name};
228 1 : s.columns.push_back(col);
229 :
230 1 : std::vector<storage::Row> rows = RunAndFetch(
231 1 : "SELECT {'id': CAST(7 AS BIGINT), 'name': 'grace'} AS rec", s);
232 1 : ASSERT_EQ(rows.size(), 1u);
233 1 : ASSERT_EQ(rows[0].cells[0].kind(), storage::Value::Kind::kStruct);
234 1 : ASSERT_EQ(rows[0].cells[0].struct_value().size(), 2u);
235 1 : EXPECT_EQ(rows[0].cells[0].struct_value()[0].int64_value(), 7);
236 1 : EXPECT_EQ(rows[0].cells[0].struct_value()[1].string_value(), "grace");
237 1 : }
238 :
239 : // Window SUM over integers often lands as HUGEINT vectors while the
240 : // analyzer still types the output column INT64.
241 1 : TEST_F(ArrowToBqTest, RendersHugeintWindowSumAsInt64) {
242 1 : schema::TableSchema s;
243 1 : schema::ColumnSchema sum_col;
244 1 : sum_col.name = "running_sum";
245 1 : sum_col.type = schema::ColumnType::kInt64;
246 1 : s.columns.push_back(sum_col);
247 :
248 1 : std::vector<storage::Row> rows = RunAndFetch(
249 1 : "SELECT SUM(x) OVER (ORDER BY x ROWS BETWEEN UNBOUNDED PRECEDING AND "
250 1 : "CURRENT ROW) AS running_sum "
251 1 : "FROM (VALUES (1), (2), (3)) AS t(x)",
252 1 : s);
253 1 : ASSERT_EQ(rows.size(), 3u);
254 1 : EXPECT_EQ(rows[0].cells[0].int64_value(), 1);
255 1 : EXPECT_EQ(rows[1].cells[0].int64_value(), 3);
256 1 : EXPECT_EQ(rows[2].cells[0].int64_value(), 6);
257 1 : }
258 :
259 : // SUM over DOUBLE inputs is promoted to DECIMAL in DuckDB; FLOAT64
260 : // output columns must still marshal (group_by / rollup fixtures).
261 1 : TEST_F(ArrowToBqTest, RendersDecimalSumAsFloat64) {
262 1 : schema::TableSchema s;
263 1 : schema::ColumnSchema total;
264 1 : total.name = "total";
265 1 : total.type = schema::ColumnType::kFloat64;
266 1 : s.columns.push_back(total);
267 :
268 1 : std::vector<storage::Row> rows = RunAndFetch(
269 1 : "SELECT SUM(amount) AS total FROM (VALUES (10.5), (2.25)) AS t(amount)",
270 1 : s);
271 1 : ASSERT_EQ(rows.size(), 1u);
272 1 : EXPECT_EQ(rows[0].cells[0].kind(), storage::Value::Kind::kFloat64);
273 1 : EXPECT_DOUBLE_EQ(rows[0].cells[0].float64_value(), 12.75);
274 1 : }
275 :
276 : // A NUMERIC column read straight out of a DECIMAL(38, 9) vector is
277 : // HUGEINT-backed (precision 38 > 18). The int128 formatter must
278 : // render it exactly as a decimal string rather than surfacing
279 : // UNIMPLEMENTED or losing precision via a double hop.
280 1 : TEST_F(ArrowToBqTest, RendersHugeintDecimalNumericExactly) {
281 1 : schema::TableSchema s;
282 1 : schema::ColumnSchema amount;
283 1 : amount.name = "amount";
284 1 : amount.type = schema::ColumnType::kNumeric;
285 1 : s.columns.push_back(amount);
286 :
287 1 : std::vector<storage::Row> rows = RunAndFetch(
288 1 : "SELECT CAST('12345678901234567890.123456789' AS DECIMAL(38, 9)) "
289 1 : "AS amount",
290 1 : s);
291 1 : ASSERT_EQ(rows.size(), 1u);
292 1 : EXPECT_EQ(rows[0].cells[0].kind(), storage::Value::Kind::kString);
293 1 : EXPECT_EQ(rows[0].cells[0].string_value(), "12345678901234567890.123456789");
294 1 : }
295 :
296 : // SUM over a NUMERIC column widens DuckDB's DECIMAL to a HUGEINT-backed
297 : // internal type while the analyzer still types the output NUMERIC.
298 : // This is the exact shape `SkipEmulatorNumericAggregateQuery` guarded.
299 1 : TEST_F(ArrowToBqTest, RendersNumericSumAsDecimalString) {
300 1 : schema::TableSchema s;
301 1 : schema::ColumnSchema total;
302 1 : total.name = "total";
303 1 : total.type = schema::ColumnType::kNumeric;
304 1 : s.columns.push_back(total);
305 :
306 1 : std::vector<storage::Row> rows = RunAndFetch(
307 1 : "SELECT SUM(amount) AS total FROM (VALUES "
308 1 : "(CAST('1.50' AS DECIMAL(38, 9))), "
309 1 : "(CAST('2.25' AS DECIMAL(38, 9))), "
310 1 : "(CAST('3.75' AS DECIMAL(38, 9)))) AS t(amount)",
311 1 : s);
312 1 : ASSERT_EQ(rows.size(), 1u);
313 1 : EXPECT_EQ(rows[0].cells[0].kind(), storage::Value::Kind::kString);
314 1 : EXPECT_EQ(rows[0].cells[0].string_value(), "7.500000000");
315 1 : }
316 :
317 : // Negative HUGEINT-backed decimals must keep the sign and pad the
318 : // fractional part. Pins the (v + 1) negate path in the int128 renderer.
319 1 : TEST_F(ArrowToBqTest, RendersNegativeHugeintDecimal) {
320 1 : schema::TableSchema s;
321 1 : schema::ColumnSchema amount;
322 1 : amount.name = "amount";
323 1 : amount.type = schema::ColumnType::kNumeric;
324 1 : s.columns.push_back(amount);
325 :
326 1 : std::vector<storage::Row> rows = RunAndFetch(
327 1 : "SELECT CAST('-99999999999999999999999999999.999999999' AS "
328 1 : "DECIMAL(38, 9)) AS amount",
329 1 : s);
330 1 : ASSERT_EQ(rows.size(), 1u);
331 1 : EXPECT_EQ(rows[0].cells[0].string_value(),
332 1 : "-99999999999999999999999999999.999999999");
333 1 : }
334 :
335 : // BIGNUMERIC columns are materialized as VARCHAR (DuckDB cannot hold
336 : // the range in a DECIMAL), so a NUMERIC/BIGNUMERIC column backed by a
337 : // VARCHAR vector reads the decimal text straight through.
338 1 : TEST_F(ArrowToBqTest, RendersVarcharBackedBignumericAsString) {
339 1 : schema::TableSchema s;
340 1 : schema::ColumnSchema amount;
341 1 : amount.name = "amount";
342 1 : amount.type = schema::ColumnType::kBignumeric;
343 1 : s.columns.push_back(amount);
344 :
345 1 : std::vector<storage::Row> rows = RunAndFetch(
346 1 : "SELECT CAST('578960446186580977117854925043439539266."
347 1 : "34992332820282019728792003956564819967' AS VARCHAR) AS amount",
348 1 : s);
349 1 : ASSERT_EQ(rows.size(), 1u);
350 1 : EXPECT_EQ(rows[0].cells[0].kind(), storage::Value::Kind::kString);
351 1 : EXPECT_EQ(rows[0].cells[0].string_value(),
352 1 : "578960446186580977117854925043439539266."
353 1 : "34992332820282019728792003956564819967");
354 1 : }
355 :
356 : // Column-count mismatch surfaces as INVALID_ARGUMENT instead of
357 : // silently truncating the rendered row. This is the contract the
358 : // engine relies on when the analyzer and DuckDB disagree on shape.
359 1 : TEST_F(ArrowToBqTest, ColumnCountMismatchReturnsInvalidArgument) {
360 1 : schema::TableSchema s;
361 1 : schema::ColumnSchema a;
362 1 : a.name = "a";
363 1 : a.type = schema::ColumnType::kInt64;
364 1 : schema::ColumnSchema b;
365 1 : b.name = "b";
366 1 : b.type = schema::ColumnType::kInt64;
367 1 : s.columns = {a, b};
368 :
369 1 : ::duckdb_result result;
370 1 : ASSERT_EQ(::duckdb_query(conn_, "SELECT CAST(1 AS BIGINT)", &result),
371 1 : ::DuckDBSuccess);
372 1 : ::duckdb_data_chunk chunk = ::duckdb_fetch_chunk(result);
373 1 : ASSERT_NE(chunk, nullptr);
374 1 : auto row = ChunkRowToCells(chunk, 0, s);
375 1 : EXPECT_FALSE(row.ok());
376 1 : EXPECT_EQ(row.status().code(), absl::StatusCode::kInvalidArgument);
377 1 : ::duckdb_destroy_data_chunk(&chunk);
378 1 : ::duckdb_destroy_result(&result);
379 1 : }
380 :
381 : } // namespace
382 : } // namespace arrow_to_bq
383 : } // namespace duckdb
384 : } // namespace engine
385 : } // namespace backend
386 : } // namespace bigquery_emulator
|