LCOV - code coverage report
Current view: top level - backend/engine/duckdb - arrow_to_bq_test.cc (source / functions) Coverage Total Hit
Test: _coverage_report.dat Lines: 100.0 % 264 264
Test Date: 2026-07-02 21:01:18 Functions: 100.0 % 17 17

            Line data    Source code
       1              : // Unit tests for the Arrow-flavored chunk -> BigQuery cell converter.
       2              : // The tests run a handful of canonical query shapes through a real
       3              : // in-memory DuckDB connection (using the libduckdb C API the engine
       4              : // itself drives) and assert that each fetched data chunk lowers to
       5              : // the canonical `storage::Value` shape the engine streams back to
       6              : // the gateway.
       7              : //
       8              : // We exercise the converter against the chunk API rather than
       9              : // against the higher-level `DuckDbExecutor` so the test does not
      10              : // pull in GoogleSQL: arrow_to_bq is purely a libduckdb-on-top-of-
      11              : // schema helper. The `DuckDbExecutor` integration is covered
      12              : // separately by `duckdb_executor_test.cc`,
      13              : // `//backend/engine/coordinator:local_coordinator_engine_test`,
      14              : // and the `gateway/e2e/query_duckdb_*` integration suite.
      15              : 
      16              : #include "backend/engine/duckdb/arrow_to_bq.h"
      17              : 
      18              : #include <string>
      19              : #include <utility>
      20              : #include <vector>
      21              : 
      22              : #include "absl/status/status.h"
      23              : #include "absl/strings/string_view.h"
      24              : #include "backend/schema/schema.h"
      25              : #include "backend/storage/storage.h"
      26              : #include "duckdb.h"
      27              : #include "gtest/gtest.h"
      28              : 
      29              : namespace bigquery_emulator {
      30              : namespace backend {
      31              : namespace engine {
      32              : namespace duckdb {
      33              : namespace arrow_to_bq {
      34              : namespace {
      35              : 
      36              : // Opens a fresh in-memory DuckDB database + connection for each test.
      37              : // TearDown() releases them in the right order (connection before
      38              : // database) so the per-test state never leaks.
      39              : class ArrowToBqTest : public ::testing::Test {
      40              :  protected:
      41           14 :   void SetUp() override {
      42           14 :     ASSERT_EQ(::duckdb_open(nullptr, &db_), ::DuckDBSuccess);
      43           14 :     ASSERT_EQ(::duckdb_connect(db_, &conn_), ::DuckDBSuccess);
      44           14 :   }
      45           14 :   void TearDown() override {
      46           14 :     if (conn_ != nullptr) ::duckdb_disconnect(&conn_);
      47           14 :     if (db_ != nullptr) ::duckdb_close(&db_);
      48           14 :   }
      49              : 
      50              :   // Runs `sql` and returns a vector of fetched rows, each rendered
      51              :   // against `schema`. The helper exhausts every chunk via
      52              :   // duckdb_fetch_chunk so the per-test assertions can read the full
      53              :   // result without juggling chunk lifetime.
      54              :   std::vector<storage::Row> RunAndFetch(absl::string_view sql,
      55           13 :                                         const schema::TableSchema& schema) {
      56           13 :     ::duckdb_result result;
      57           13 :     const std::string sql_str(sql);
      58           26 :     EXPECT_EQ(::duckdb_query(conn_, sql_str.c_str(), &result), ::DuckDBSuccess)
      59           26 :         << ::duckdb_result_error(&result);
      60           13 :     std::vector<storage::Row> rows;
      61           26 :     while (true) {
      62           26 :       ::duckdb_data_chunk chunk = ::duckdb_fetch_chunk(result);
      63           26 :       if (chunk == nullptr) break;
      64           13 :       const ::idx_t size = ::duckdb_data_chunk_get_size(chunk);
      65           31 :       for (::idx_t r = 0; r < size; ++r) {
      66           18 :         auto row = ChunkRowToCells(chunk, r, schema);
      67           36 :         EXPECT_TRUE(row.ok()) << row.status();
      68           18 :         if (row.ok()) rows.push_back(std::move(row).value());
      69           18 :       }
      70           13 :       ::duckdb_destroy_data_chunk(&chunk);
      71           13 :     }
      72           13 :     ::duckdb_destroy_result(&result);
      73           13 :     return rows;
      74           13 :   }
      75              : 
      76              :   ::duckdb_database db_ = nullptr;
      77              :   ::duckdb_connection conn_ = nullptr;
      78              : };
      79              : 
      80              : // SELECT 1 against a single-column INT64 schema is the simplest
      81              : // shape: an INT64 vector with one element, no validity mask, and
      82              : // the rendered cell carries the analyzer's int64 value untouched.
      83            1 : TEST_F(ArrowToBqTest, RendersBigintAsInt64Cell) {
      84            1 :   schema::TableSchema s;
      85            1 :   schema::ColumnSchema col;
      86            1 :   col.name = "n";
      87            1 :   col.type = schema::ColumnType::kInt64;
      88            1 :   s.columns.push_back(col);
      89              : 
      90            1 :   std::vector<storage::Row> rows =
      91            1 :       RunAndFetch("SELECT CAST(42 AS BIGINT) AS n", s);
      92            1 :   ASSERT_EQ(rows.size(), 1u);
      93            1 :   ASSERT_EQ(rows[0].cells.size(), 1u);
      94            1 :   EXPECT_EQ(rows[0].cells[0].kind(), storage::Value::Kind::kInt64);
      95            1 :   EXPECT_EQ(rows[0].cells[0].int64_value(), 42);
      96            1 : }
      97              : 
      98              : // VARCHAR cells round-trip through duckdb_string_t's inline-or-pointer
      99              : // storage. We exercise both: "ada" fits in the 12-byte inline slot,
     100              : // and "abracadabra-extra-long-string" lives behind the pointer leg.
     101            1 : TEST_F(ArrowToBqTest, RendersVarcharBothInlineAndPointer) {
     102            1 :   schema::TableSchema s;
     103            1 :   schema::ColumnSchema col;
     104            1 :   col.name = "name";
     105            1 :   col.type = schema::ColumnType::kString;
     106            1 :   s.columns.push_back(col);
     107              : 
     108            1 :   std::vector<storage::Row> rows = RunAndFetch(
     109            1 :       "SELECT 'ada' UNION ALL SELECT 'abracadabra-extra-long-string' "
     110            1 :       "ORDER BY 1",
     111            1 :       s);
     112            1 :   ASSERT_EQ(rows.size(), 2u);
     113            1 :   EXPECT_EQ(rows[0].cells[0].kind(), storage::Value::Kind::kString);
     114            1 :   EXPECT_EQ(rows[0].cells[0].string_value(), "abracadabra-extra-long-string");
     115            1 :   EXPECT_EQ(rows[1].cells[0].kind(), storage::Value::Kind::kString);
     116            1 :   EXPECT_EQ(rows[1].cells[0].string_value(), "ada");
     117            1 : }
     118              : 
     119              : // NULL cells must be detected via the validity mask, not via a raw
     120              : // data-pointer read. The fixture column is NULLABLE so the analyzer
     121              : // allocates a mask we can exercise.
     122            1 : TEST_F(ArrowToBqTest, NullValidityMaskProducesNullCell) {
     123            1 :   schema::TableSchema s;
     124            1 :   schema::ColumnSchema col;
     125            1 :   col.name = "v";
     126            1 :   col.type = schema::ColumnType::kInt64;
     127            1 :   s.columns.push_back(col);
     128              : 
     129            1 :   std::vector<storage::Row> rows = RunAndFetch(
     130            1 :       "SELECT CAST(NULL AS BIGINT) UNION ALL SELECT CAST(7 AS BIGINT) "
     131            1 :       "ORDER BY 1 NULLS FIRST",
     132            1 :       s);
     133            1 :   ASSERT_EQ(rows.size(), 2u);
     134            1 :   EXPECT_TRUE(rows[0].cells[0].is_null());
     135            1 :   EXPECT_EQ(rows[1].cells[0].kind(), storage::Value::Kind::kInt64);
     136            1 :   EXPECT_EQ(rows[1].cells[0].int64_value(), 7);
     137            1 : }
     138              : 
     139              : // BOOL / FLOAT64 / DATE / TIMESTAMP cover the per-type renderings
     140              : // every BigQuery cell shape relies on. We pin the exact string
     141              : // formats for DATE / TIMESTAMP because those are what the gateway
     142              : // passes through verbatim to the REST wire envelope.
     143            1 : TEST_F(ArrowToBqTest, RendersScalarTypeFamily) {
     144            1 :   schema::TableSchema s;
     145            1 :   schema::ColumnSchema flag;
     146            1 :   flag.name = "flag";
     147            1 :   flag.type = schema::ColumnType::kBool;
     148            1 :   schema::ColumnSchema score;
     149            1 :   score.name = "score";
     150            1 :   score.type = schema::ColumnType::kFloat64;
     151            1 :   schema::ColumnSchema dt;
     152            1 :   dt.name = "dt";
     153            1 :   dt.type = schema::ColumnType::kDate;
     154            1 :   schema::ColumnSchema ts;
     155            1 :   ts.name = "ts";
     156            1 :   ts.type = schema::ColumnType::kTimestamp;
     157            1 :   s.columns = {flag, score, dt, ts};
     158              : 
     159            1 :   std::vector<storage::Row> rows = RunAndFetch(
     160            1 :       "SELECT TRUE AS flag, CAST(1.5 AS DOUBLE) AS score, "
     161            1 :       "DATE '2024-03-09' AS dt, TIMESTAMP '2024-03-09 12:34:56' AS ts",
     162            1 :       s);
     163            1 :   ASSERT_EQ(rows.size(), 1u);
     164            1 :   ASSERT_EQ(rows[0].cells.size(), 4u);
     165            1 :   EXPECT_EQ(rows[0].cells[0].kind(), storage::Value::Kind::kBool);
     166            1 :   EXPECT_TRUE(rows[0].cells[0].bool_value());
     167            1 :   EXPECT_EQ(rows[0].cells[1].kind(), storage::Value::Kind::kFloat64);
     168            1 :   EXPECT_DOUBLE_EQ(rows[0].cells[1].float64_value(), 1.5);
     169            1 :   EXPECT_EQ(rows[0].cells[2].kind(), storage::Value::Kind::kString);
     170            1 :   EXPECT_EQ(rows[0].cells[2].string_value(), "2024-03-09");
     171            1 :   EXPECT_EQ(rows[0].cells[3].kind(), storage::Value::Kind::kString);
     172            1 :   EXPECT_EQ(rows[0].cells[3].string_value(), "2024-03-09 12:34:56.000000");
     173            1 : }
     174              : 
     175            1 : TEST_F(ArrowToBqTest, RendersTimestampTzShortOffsetAsUtcTimestamp) {
     176            1 :   schema::TableSchema s;
     177            1 :   schema::ColumnSchema ts;
     178            1 :   ts.name = "ts";
     179            1 :   ts.type = schema::ColumnType::kTimestamp;
     180            1 :   s.columns.push_back(ts);
     181              : 
     182            1 :   std::vector<storage::Row> rows =
     183            1 :       RunAndFetch("SELECT TIMESTAMPTZ '2025-12-01 10:49:40+00' AS ts", s);
     184            1 :   ASSERT_EQ(rows.size(), 1u);
     185            1 :   ASSERT_EQ(rows[0].cells.size(), 1u);
     186            1 :   EXPECT_EQ(rows[0].cells[0].kind(), storage::Value::Kind::kString);
     187            1 :   EXPECT_EQ(rows[0].cells[0].string_value(), "2025-12-01 10:49:40.000000");
     188            1 : }
     189              : 
     190              : // LIST vectors lower into `Value::Array`, with each element rendered
     191              : // against the inner column schema. The fixture mixes a non-empty and
     192              : // an empty array so the offset / length math gets exercised twice.
     193            1 : TEST_F(ArrowToBqTest, RendersListAsRepeatedArrayCell) {
     194            1 :   schema::TableSchema s;
     195            1 :   schema::ColumnSchema col;
     196            1 :   col.name = "tags";
     197            1 :   col.type = schema::ColumnType::kString;
     198            1 :   col.mode = schema::ColumnMode::kRepeated;
     199            1 :   s.columns.push_back(col);
     200              : 
     201            1 :   std::vector<storage::Row> rows = RunAndFetch(
     202            1 :       "SELECT * FROM (VALUES (CAST([] AS VARCHAR[])), "
     203            1 :       "(['x', 'y'])) AS t(tags) ORDER BY array_length(tags)",
     204            1 :       s);
     205            1 :   ASSERT_EQ(rows.size(), 2u);
     206            1 :   EXPECT_EQ(rows[0].cells[0].kind(), storage::Value::Kind::kArray);
     207            1 :   EXPECT_TRUE(rows[0].cells[0].array_value().empty());
     208            1 :   EXPECT_EQ(rows[1].cells[0].kind(), storage::Value::Kind::kArray);
     209            1 :   ASSERT_EQ(rows[1].cells[0].array_value().size(), 2u);
     210            1 :   EXPECT_EQ(rows[1].cells[0].array_value()[0].string_value(), "x");
     211            1 :   EXPECT_EQ(rows[1].cells[0].array_value()[1].string_value(), "y");
     212            1 : }
     213              : 
     214              : // STRUCT vectors lower into `Value::Struct` with positional fields in
     215              : // the same order the analyzer's `ColumnSchema::fields` lists them.
     216            1 : TEST_F(ArrowToBqTest, RendersStructAsStructCell) {
     217            1 :   schema::TableSchema s;
     218            1 :   schema::ColumnSchema col;
     219            1 :   col.name = "rec";
     220            1 :   col.type = schema::ColumnType::kStruct;
     221            1 :   schema::ColumnSchema f_id;
     222            1 :   f_id.name = "id";
     223            1 :   f_id.type = schema::ColumnType::kInt64;
     224            1 :   schema::ColumnSchema f_name;
     225            1 :   f_name.name = "name";
     226            1 :   f_name.type = schema::ColumnType::kString;
     227            1 :   col.fields = {f_id, f_name};
     228            1 :   s.columns.push_back(col);
     229              : 
     230            1 :   std::vector<storage::Row> rows = RunAndFetch(
     231            1 :       "SELECT {'id': CAST(7 AS BIGINT), 'name': 'grace'} AS rec", s);
     232            1 :   ASSERT_EQ(rows.size(), 1u);
     233            1 :   ASSERT_EQ(rows[0].cells[0].kind(), storage::Value::Kind::kStruct);
     234            1 :   ASSERT_EQ(rows[0].cells[0].struct_value().size(), 2u);
     235            1 :   EXPECT_EQ(rows[0].cells[0].struct_value()[0].int64_value(), 7);
     236            1 :   EXPECT_EQ(rows[0].cells[0].struct_value()[1].string_value(), "grace");
     237            1 : }
     238              : 
     239              : // Window SUM over integers often lands as HUGEINT vectors while the
     240              : // analyzer still types the output column INT64.
     241            1 : TEST_F(ArrowToBqTest, RendersHugeintWindowSumAsInt64) {
     242            1 :   schema::TableSchema s;
     243            1 :   schema::ColumnSchema sum_col;
     244            1 :   sum_col.name = "running_sum";
     245            1 :   sum_col.type = schema::ColumnType::kInt64;
     246            1 :   s.columns.push_back(sum_col);
     247              : 
     248            1 :   std::vector<storage::Row> rows = RunAndFetch(
     249            1 :       "SELECT SUM(x) OVER (ORDER BY x ROWS BETWEEN UNBOUNDED PRECEDING AND "
     250            1 :       "CURRENT ROW) AS running_sum "
     251            1 :       "FROM (VALUES (1), (2), (3)) AS t(x)",
     252            1 :       s);
     253            1 :   ASSERT_EQ(rows.size(), 3u);
     254            1 :   EXPECT_EQ(rows[0].cells[0].int64_value(), 1);
     255            1 :   EXPECT_EQ(rows[1].cells[0].int64_value(), 3);
     256            1 :   EXPECT_EQ(rows[2].cells[0].int64_value(), 6);
     257            1 : }
     258              : 
     259              : // DuckDB types literals like 10.5 as DECIMAL, so this SUM stays DECIMAL;
     260              : // FLOAT64 output columns must still marshal (group_by / rollup fixtures).
     261            1 : TEST_F(ArrowToBqTest, RendersDecimalSumAsFloat64) {
     262            1 :   schema::TableSchema s;
     263            1 :   schema::ColumnSchema total;
     264            1 :   total.name = "total";
     265            1 :   total.type = schema::ColumnType::kFloat64;
     266            1 :   s.columns.push_back(total);
     267              : 
     268            1 :   std::vector<storage::Row> rows = RunAndFetch(
     269            1 :       "SELECT SUM(amount) AS total FROM (VALUES (10.5), (2.25)) AS t(amount)",
     270            1 :       s);
     271            1 :   ASSERT_EQ(rows.size(), 1u);
     272            1 :   EXPECT_EQ(rows[0].cells[0].kind(), storage::Value::Kind::kFloat64);
     273            1 :   EXPECT_DOUBLE_EQ(rows[0].cells[0].float64_value(), 12.75);
     274            1 : }
     275              : 
     276              : // A NUMERIC column read straight out of a DECIMAL(38, 9) vector is
     277              : // HUGEINT-backed (precision 38 > 18). The int128 formatter must
     278              : // render it exactly as a decimal string rather than surfacing
     279              : // UNIMPLEMENTED or losing precision via a double hop.
     280            1 : TEST_F(ArrowToBqTest, RendersHugeintDecimalNumericExactly) {
     281            1 :   schema::TableSchema s;
     282            1 :   schema::ColumnSchema amount;
     283            1 :   amount.name = "amount";
     284            1 :   amount.type = schema::ColumnType::kNumeric;
     285            1 :   s.columns.push_back(amount);
     286              : 
     287            1 :   std::vector<storage::Row> rows = RunAndFetch(
     288            1 :       "SELECT CAST('12345678901234567890.123456789' AS DECIMAL(38, 9)) "
     289            1 :       "AS amount",
     290            1 :       s);
     291            1 :   ASSERT_EQ(rows.size(), 1u);
     292            1 :   EXPECT_EQ(rows[0].cells[0].kind(), storage::Value::Kind::kString);
     293            1 :   EXPECT_EQ(rows[0].cells[0].string_value(), "12345678901234567890.123456789");
     294            1 : }
     295              : 
     296              : // SUM over a NUMERIC column widens DuckDB's DECIMAL to a HUGEINT-backed
     297              : // internal type while the analyzer still types the output NUMERIC.
     298              : // This is the exact shape `SkipEmulatorNumericAggregateQuery` guarded.
     299            1 : TEST_F(ArrowToBqTest, RendersNumericSumAsDecimalString) {
     300            1 :   schema::TableSchema s;
     301            1 :   schema::ColumnSchema total;
     302            1 :   total.name = "total";
     303            1 :   total.type = schema::ColumnType::kNumeric;
     304            1 :   s.columns.push_back(total);
     305              : 
     306            1 :   std::vector<storage::Row> rows = RunAndFetch(
     307            1 :       "SELECT SUM(amount) AS total FROM (VALUES "
     308            1 :       "(CAST('1.50' AS DECIMAL(38, 9))), "
     309            1 :       "(CAST('2.25' AS DECIMAL(38, 9))), "
     310            1 :       "(CAST('3.75' AS DECIMAL(38, 9)))) AS t(amount)",
     311            1 :       s);
     312            1 :   ASSERT_EQ(rows.size(), 1u);
     313            1 :   EXPECT_EQ(rows[0].cells[0].kind(), storage::Value::Kind::kString);
     314            1 :   EXPECT_EQ(rows[0].cells[0].string_value(), "7.500000000");
     315            1 : }
     316              : 
     317              : // Negative HUGEINT-backed decimals must keep the sign and pad the
     318              : // fractional part. Pins the (v + 1) negate path in the int128 renderer.
     319            1 : TEST_F(ArrowToBqTest, RendersNegativeHugeintDecimal) {
     320            1 :   schema::TableSchema s;
     321            1 :   schema::ColumnSchema amount;
     322            1 :   amount.name = "amount";
     323            1 :   amount.type = schema::ColumnType::kNumeric;
     324            1 :   s.columns.push_back(amount);
     325              : 
     326            1 :   std::vector<storage::Row> rows = RunAndFetch(
     327            1 :       "SELECT CAST('-99999999999999999999999999999.999999999' AS "
     328            1 :       "DECIMAL(38, 9)) AS amount",
     329            1 :       s);
     330            1 :   ASSERT_EQ(rows.size(), 1u);
     331            1 :   EXPECT_EQ(rows[0].cells[0].string_value(),
     332            1 :             "-99999999999999999999999999999.999999999");
     333            1 : }
     334              : 
     335              : // BIGNUMERIC columns are materialized as VARCHAR (DuckDB cannot hold
     336              : // the range in a DECIMAL), so a NUMERIC/BIGNUMERIC column backed by a
     337              : // VARCHAR vector reads the decimal text straight through.
     338            1 : TEST_F(ArrowToBqTest, RendersVarcharBackedBignumericAsString) {
     339            1 :   schema::TableSchema s;
     340            1 :   schema::ColumnSchema amount;
     341            1 :   amount.name = "amount";
     342            1 :   amount.type = schema::ColumnType::kBignumeric;
     343            1 :   s.columns.push_back(amount);
     344              : 
     345            1 :   std::vector<storage::Row> rows = RunAndFetch(
     346            1 :       "SELECT CAST('578960446186580977117854925043439539266."
     347            1 :       "34992332820282019728792003956564819967' AS VARCHAR) AS amount",
     348            1 :       s);
     349            1 :   ASSERT_EQ(rows.size(), 1u);
     350            1 :   EXPECT_EQ(rows[0].cells[0].kind(), storage::Value::Kind::kString);
     351            1 :   EXPECT_EQ(rows[0].cells[0].string_value(),
     352            1 :             "578960446186580977117854925043439539266."
     353            1 :             "34992332820282019728792003956564819967");
     354            1 : }
     355              : 
     356              : // Column-count mismatch surfaces as INVALID_ARGUMENT instead of
     357              : // silently truncating the rendered row. This is the contract the
     358              : // engine relies on when the analyzer and DuckDB disagree on shape.
     359            1 : TEST_F(ArrowToBqTest, ColumnCountMismatchReturnsInvalidArgument) {
     360            1 :   schema::TableSchema s;
     361            1 :   schema::ColumnSchema a;
     362            1 :   a.name = "a";
     363            1 :   a.type = schema::ColumnType::kInt64;
     364            1 :   schema::ColumnSchema b;
     365            1 :   b.name = "b";
     366            1 :   b.type = schema::ColumnType::kInt64;
     367            1 :   s.columns = {a, b};
     368              : 
     369            1 :   ::duckdb_result result;
     370            1 :   ASSERT_EQ(::duckdb_query(conn_, "SELECT CAST(1 AS BIGINT)", &result),
     371            1 :             ::DuckDBSuccess);
     372            1 :   ::duckdb_data_chunk chunk = ::duckdb_fetch_chunk(result);
     373            1 :   ASSERT_NE(chunk, nullptr);
     374            1 :   auto row = ChunkRowToCells(chunk, 0, s);
     375            1 :   EXPECT_FALSE(row.ok());
     376            1 :   EXPECT_EQ(row.status().code(), absl::StatusCode::kInvalidArgument);
     377            1 :   ::duckdb_destroy_data_chunk(&chunk);
     378            1 :   ::duckdb_destroy_result(&result);
     379            1 : }
     380              : 
     381              : }  // namespace
     382              : }  // namespace arrow_to_bq
     383              : }  // namespace duckdb
     384              : }  // namespace engine
     385              : }  // namespace backend
     386              : }  // namespace bigquery_emulator
        

Generated by: LCOV version 2.0-1