LCOV - code coverage report
Current view: top level - backend/engine/duckdb/transpiler - transpiler_emit_sample_test.cc (source / functions) Coverage Total Hit
Test: _coverage_report.dat Lines: 97.6 % 168 164
Test Date: 2026-07-02 21:01:18 Functions: 100.0 % 13 13

            Line data    Source code
       1              : #include "backend/engine/duckdb/transpiler/transpiler_test_fixture.h"
       2              : 
       3              : namespace bigquery_emulator {
       4              : namespace backend {
       5              : namespace engine {
       6              : namespace duckdb {
       7              : namespace transpiler {
       8              : 
       9            1 : TEST_F(TranspilerTest, EmitSampleScanSystemPercentFromSurface) {
      10              :   // BigQuery `TABLESAMPLE SYSTEM (10 PERCENT)` lowers to a
      11              :   // `ResolvedSampleScan` whose `method=SYSTEM`, `unit=PERCENT`,
      12              :   // and `size=10`. DuckDB's `USING SAMPLE 10 PERCENT (system)`
      13              :   // matches the BQ semantics (block-level sampling at the chosen
      14              :   // percent).
      15            1 :   const ::googlesql::ResolvedStatement* stmt =
      16            1 :       Analyze("SELECT * FROM people TABLESAMPLE SYSTEM (10 PERCENT)");
      17            1 :   if (stmt == nullptr) {
      18            0 :     GTEST_SKIP() << "analyzer rejected TABLESAMPLE SYSTEM -- skip";
      19            0 :   }
      20            1 :   const ::googlesql::ResolvedScan* scan = QueryInputScan(stmt);
      21            1 :   if (scan == nullptr ||
      22            1 :       scan->node_kind() != ::googlesql::RESOLVED_SAMPLE_SCAN) {
      23            0 :     GTEST_SKIP() << "analyzer did not produce ResolvedSampleScan -- skip";
      24            0 :   }
      25            1 :   TestTranspiler t;
      26            1 :   EXPECT_EQ(t.EmitSampleScan(scan->GetAs<::googlesql::ResolvedSampleScan>()),
      27            1 :             "SELECT * FROM (SELECT \"id\", \"name\" FROM \"people\") "
      28            1 :             "USING SAMPLE 10 PERCENT (system)");
      29            1 : }
      30              : 
      31              : // Helpers: `TestSampleScanArgs` and `MakeTestSampleScan` synthesize a
      32              : // `ResolvedSampleScan` directly so each emit-shape (method + unit +
      33              : // optional repeatable/weight/stratify) can be exercised without
      34              : // driving the analyzer through the (sometimes BQ-only) surface SQL
      35              : // forms. The input is a fresh `ResolvedSingleRowScan` so the wrapped
      36              : // child scan always emits `SELECT 1`. Callers transfer ownership of
      37              : // the size / repeatable / weight / partition_by expressions through
      38              : // `std::move`. The function returns nullptr only when the size expression
      39              : // is missing (a malformed SampleScan the analyzer would never produce).
      40              : struct TestSampleScanArgs {
      41              :   std::string method;
      42              :   ::googlesql::ResolvedSampleScan::SampleUnit unit =
      43              :       ::googlesql::ResolvedSampleScan::PERCENT;
      44              :   std::unique_ptr<const ::googlesql::ResolvedExpr> size{};
      45              :   std::unique_ptr<const ::googlesql::ResolvedExpr> repeatable{};
      46              :   std::unique_ptr<const ::googlesql::ResolvedColumnHolder> weight{};
      47              :   std::vector<std::unique_ptr<const ::googlesql::ResolvedExpr>> partition_by{};
      48              : };
      49              : std::unique_ptr<::googlesql::ResolvedSampleScan> MakeTestSampleScan(
      50           12 :     TestSampleScanArgs args) {
      51           12 :   if (args.size == nullptr) return nullptr;
      52           12 :   return ::googlesql::MakeResolvedSampleScan(
      53           12 :       /*column_list=*/{},
      54           12 :       ::googlesql::MakeResolvedSingleRowScan(),
      55           12 :       args.method,
      56           12 :       std::move(args.size),
      57           12 :       args.unit,
      58           12 :       std::move(args.repeatable),
      59           12 :       std::move(args.weight),
      60           12 :       std::move(args.partition_by));
      61           12 : }
      62              : 
      63            1 : TEST_F(TranspilerTest, EmitSampleScanBernoulliPercentDirect) {
      64              :   // BERNOULLI sampling over PERCENT is the second DuckDB
      65              :   // method/unit combination the plan calls out. Direct
      66              :   // construction sidesteps any analyzer-surface variability around
      67              :   // method names other than SYSTEM. The expected SQL pins the
      68              :   // DuckDB shape `USING SAMPLE <n> PERCENT (bernoulli)`.
      69            1 :   TestSampleScanArgs args;
      70            1 :   args.method = "BERNOULLI";
      71            1 :   args.unit = ::googlesql::ResolvedSampleScan::PERCENT;
      72            1 :   args.size = ::googlesql::MakeResolvedLiteral(::googlesql::Value::Int64(25));
      73            1 :   auto sample = MakeTestSampleScan(std::move(args));
      74            1 :   ASSERT_NE(sample, nullptr);
      75            1 :   TestTranspiler t;
      76            1 :   EXPECT_EQ(t.EmitSampleScan(sample.get()),
      77            1 :             "SELECT * FROM (SELECT 1) USING SAMPLE 25 PERCENT (bernoulli)");
      78            1 : }
      79              : 
      80            1 : TEST_F(TranspilerTest, EmitSampleScanReservoirRowsDirect) {
      81              :   // RESERVOIR over ROWS. DuckDB picks reservoir sampling to hit an
      82              :   // exact row count, matching the BQ `RESERVOIR` semantics for
      83              :   // ROWS-shape sampling. We construct directly so the assertion
      84              :   // does not depend on the BQ surface accepting the `RESERVOIR
      85              :   // (50 ROWS)` form.
      86            1 :   TestSampleScanArgs args;
      87            1 :   args.method = "RESERVOIR";
      88            1 :   args.unit = ::googlesql::ResolvedSampleScan::ROWS;
      89            1 :   args.size = ::googlesql::MakeResolvedLiteral(::googlesql::Value::Int64(50));
      90            1 :   auto sample = MakeTestSampleScan(std::move(args));
      91            1 :   ASSERT_NE(sample, nullptr);
      92            1 :   TestTranspiler t;
      93            1 :   EXPECT_EQ(t.EmitSampleScan(sample.get()),
      94            1 :             "SELECT * FROM (SELECT 1) USING SAMPLE 50 ROWS (reservoir)");
      95            1 : }
      96              : 
      97            1 : TEST_F(TranspilerTest, EmitSampleScanSystemPercentDirect) {
      98              :   // SYSTEM over PERCENT through direct construction, mirroring the
      99              :   // surface-driven SYSTEM test so a future analyzer-side rewrite
     100              :   // of TABLESAMPLE leaves the direct-construction assertion as a
     101              :   // stable contract.
     102            1 :   TestSampleScanArgs args;
     103            1 :   args.method = "SYSTEM";
     104            1 :   args.unit = ::googlesql::ResolvedSampleScan::PERCENT;
     105            1 :   args.size = ::googlesql::MakeResolvedLiteral(::googlesql::Value::Int64(5));
     106            1 :   auto sample = MakeTestSampleScan(std::move(args));
     107            1 :   ASSERT_NE(sample, nullptr);
     108            1 :   TestTranspiler t;
     109            1 :   EXPECT_EQ(t.EmitSampleScan(sample.get()),
     110            1 :             "SELECT * FROM (SELECT 1) USING SAMPLE 5 PERCENT (system)");
     111            1 : }
     112              : 
     113            1 : TEST_F(TranspilerTest, EmitSampleScanReservoirPercentMismatchFallsBack) {
     114              :   // RESERVOIR with PERCENT does not have a clean DuckDB analog --
     115              :   // reservoir sampling targets a specific row count -- so we fall
     116              :   // back rather than emit `USING SAMPLE N PERCENT (reservoir)`.
     117              :   // NOTE(review): confirm DuckDB really rejects that form at parse time.
     118            1 :   TestSampleScanArgs args;
     119            1 :   args.method = "RESERVOIR";
     120            1 :   args.unit = ::googlesql::ResolvedSampleScan::PERCENT;
     121            1 :   args.size = ::googlesql::MakeResolvedLiteral(::googlesql::Value::Int64(10));
     122            1 :   auto sample = MakeTestSampleScan(std::move(args));
     123            1 :   ASSERT_NE(sample, nullptr);
     124            1 :   TestTranspiler t;
     125            1 :   EXPECT_EQ(t.EmitSampleScan(sample.get()), "");
     126            1 : }
     127              : 
     128            1 : TEST_F(TranspilerTest, EmitSampleScanSystemRowsMismatchFallsBack) {
     129              :   // SYSTEM with ROWS has no DuckDB equivalent (system sampling is
     130              :   // a percent-form block sampler). Bail so the engine surfaces
     131              :   // UNIMPLEMENTED for the whole query.
     132            1 :   TestSampleScanArgs args;
     133            1 :   args.method = "SYSTEM";
     134            1 :   args.unit = ::googlesql::ResolvedSampleScan::ROWS;
     135            1 :   args.size = ::googlesql::MakeResolvedLiteral(::googlesql::Value::Int64(100));
     136            1 :   auto sample = MakeTestSampleScan(std::move(args));
     137            1 :   ASSERT_NE(sample, nullptr);
     138            1 :   TestTranspiler t;
     139            1 :   EXPECT_EQ(t.EmitSampleScan(sample.get()), "");
     140            1 : }
     141              : 
     142            1 : TEST_F(TranspilerTest, EmitSampleScanUnknownMethodFallsBack) {
     143              :   // Methods outside the {SYSTEM, BERNOULLI, RESERVOIR} matrix do
     144              :   // not have a DuckDB analog. The emit falls back rather than
     145              :   // emitting `USING SAMPLE ... (other)`, which DuckDB rejects.
     146            1 :   TestSampleScanArgs args;
     147            1 :   args.method = "OTHER";
     148            1 :   args.unit = ::googlesql::ResolvedSampleScan::PERCENT;
     149            1 :   args.size = ::googlesql::MakeResolvedLiteral(::googlesql::Value::Int64(10));
     150            1 :   auto sample = MakeTestSampleScan(std::move(args));
     151            1 :   ASSERT_NE(sample, nullptr);
     152            1 :   TestTranspiler t;
     153            1 :   EXPECT_EQ(t.EmitSampleScan(sample.get()), "");
     154            1 : }
     155              : 
     156            1 : TEST_F(TranspilerTest, EmitSampleScanWithRepeatableSeed) {
     157              :   // A repeatable seed pins the PRNG for deterministic sampling; the
     158              :   // transpiler forwards the seed expression verbatim, emitted in
     159              :   // DuckDB's `(<method>, <seed>)` form rather than `REPEATABLE (<seed>)`.
     160            1 :   TestSampleScanArgs args;
     161            1 :   args.method = "SYSTEM";
     162            1 :   args.unit = ::googlesql::ResolvedSampleScan::PERCENT;
     163            1 :   args.size = ::googlesql::MakeResolvedLiteral(::googlesql::Value::Int64(10));
     164            1 :   args.repeatable =
     165            1 :       ::googlesql::MakeResolvedLiteral(::googlesql::Value::Int64(42));
     166            1 :   auto sample = MakeTestSampleScan(std::move(args));
     167            1 :   ASSERT_NE(sample, nullptr);
     168            1 :   TestTranspiler t;
     169            1 :   EXPECT_EQ(t.EmitSampleScan(sample.get()),
     170            1 :             "SELECT * FROM (SELECT 1) USING SAMPLE 10 PERCENT (system, 42)");
     171            1 : }
     172              : 
     173            1 : TEST_F(TranspilerTest, EmitSampleScanWithWeightColumnFallsBack) {
     174              :   // BigQuery `WITH WEIGHT <col>` lowers to a `weight_column` on
     175              :   // the SampleScan. DuckDB has no native weighted-sampling
     176              :   // keyword on `USING SAMPLE`, so we fall back. The test uses a
     177              :   // synthetic ResolvedColumn for the weight column so the
     178              :   // assertion does not depend on a particular surface that exposes
     179              :   // weighted sampling.
     180            1 :   TestSampleScanArgs args;
     181            1 :   args.method = "SYSTEM";
     182            1 :   args.unit = ::googlesql::ResolvedSampleScan::PERCENT;
     183            1 :   args.size = ::googlesql::MakeResolvedLiteral(::googlesql::Value::Int64(10));
     184            1 :   ::googlesql::ResolvedColumn weight_col(
     185            1 :       /*column_id=*/1,
     186            1 :       /*table_name=*/::googlesql::IdString::MakeGlobal("$sample"),
     187            1 :       /*name=*/::googlesql::IdString::MakeGlobal("w"),
     188            1 :       type_factory_->get_double());
     189            1 :   args.weight = ::googlesql::MakeResolvedColumnHolder(weight_col);
     190            1 :   auto sample = MakeTestSampleScan(std::move(args));
     191            1 :   ASSERT_NE(sample, nullptr);
     192            1 :   TestTranspiler t;
     193            1 :   EXPECT_EQ(t.EmitSampleScan(sample.get()), "");
     194            1 : }
     195              : 
     196            1 : TEST_F(TranspilerTest, EmitSampleScanWithStratifyFallsBack) {
     197              :   // BigQuery STRATIFY-BY surface populates `partition_by_list`.
     198              :   // DuckDB's `USING SAMPLE` has no per-partition sampling clause,
     199              :   // so we fall back. We push one stratify expression onto the
     200              :   // list (a literal so the fallback assertion is about the list
     201              :   // being non-empty, not about a sub-expression failure).
     202            1 :   TestSampleScanArgs args;
     203            1 :   args.method = "SYSTEM";
     204            1 :   args.unit = ::googlesql::ResolvedSampleScan::PERCENT;
     205            1 :   args.size = ::googlesql::MakeResolvedLiteral(::googlesql::Value::Int64(10));
     206            1 :   args.partition_by.push_back(
     207            1 :       ::googlesql::MakeResolvedLiteral(::googlesql::Value::Int64(1)));
     208            1 :   auto sample = MakeTestSampleScan(std::move(args));
     209            1 :   ASSERT_NE(sample, nullptr);
     210            1 :   TestTranspiler t;
     211            1 :   EXPECT_EQ(t.EmitSampleScan(sample.get()), "");
     212            1 : }
     213              : 
     214            1 : TEST_F(TranspilerTest, EmitSampleScanUnloweredSizeFallsBack) {
     215              :   // A size expression we cannot lower (an untyped parameter)
     216              :   // propagates "" through `EmitExpr`; the SampleScan emit must
     217              :   // then return "" rather than emit `USING SAMPLE  PERCENT (...)`.
     218            1 :   TestSampleScanArgs args;
     219            1 :   args.method = "SYSTEM";
     220            1 :   args.unit = ::googlesql::ResolvedSampleScan::PERCENT;
     221            1 :   args.size = ::googlesql::MakeResolvedParameter(type_factory_->get_int64(),
     222            1 :                                                  /*name=*/"n",
     223            1 :                                                  /*position=*/0,
     224            1 :                                                  /*is_untyped=*/true);
     225            1 :   auto sample = MakeTestSampleScan(std::move(args));
     226            1 :   ASSERT_NE(sample, nullptr);
     227            1 :   TestTranspiler t;
     228            1 :   EXPECT_EQ(t.EmitSampleScan(sample.get()), "");
     229            1 : }
     230              : 
     231            1 : TEST_F(TranspilerTest, EmitSampleScanPercentVsRowsContrast) {
     232              :   // Execution-style contrast: PERCENT and ROWS produce different
     233              :   // DuckDB shapes for the same numeric value. We assert on the
     234              :   // emitted SQL text so a regression in the unit selector surfaces
     235              :   // here. Both scans use direct construction so we can pin the
     236              :   // emit shape regardless of analyzer-side rewrites.
     237            1 :   TestSampleScanArgs percent_args;
     238            1 :   percent_args.method = "BERNOULLI";
     239            1 :   percent_args.unit = ::googlesql::ResolvedSampleScan::PERCENT;
     240            1 :   percent_args.size =
     241            1 :       ::googlesql::MakeResolvedLiteral(::googlesql::Value::Int64(10));
     242            1 :   auto percent_sample = MakeTestSampleScan(std::move(percent_args));
     243            1 :   ASSERT_NE(percent_sample, nullptr);
     244            1 :   TestSampleScanArgs rows_args;
     245            1 :   rows_args.method = "RESERVOIR";
     246            1 :   rows_args.unit = ::googlesql::ResolvedSampleScan::ROWS;
     247            1 :   rows_args.size =
     248            1 :       ::googlesql::MakeResolvedLiteral(::googlesql::Value::Int64(10));
     249            1 :   auto rows_sample = MakeTestSampleScan(std::move(rows_args));
     250            1 :   ASSERT_NE(rows_sample, nullptr);
     251            1 :   TestTranspiler t_percent;
     252            1 :   TestTranspiler t_rows;
     253            1 :   std::string percent_sql = t_percent.EmitSampleScan(percent_sample.get());
     254            1 :   std::string rows_sql = t_rows.EmitSampleScan(rows_sample.get());
     255            1 :   EXPECT_NE(percent_sql, rows_sql);
     256            1 :   EXPECT_NE(percent_sql.find(" PERCENT "), std::string::npos);
     257            1 :   EXPECT_NE(rows_sql.find(" ROWS "), std::string::npos);
     258            1 : }
     259              : 
     260              : // --- ResolvedWithScan / ResolvedWithRefScan ----------------------------
     261              : //
     262              : // `docs/ENGINE_POLICY.md` Family 1. These tests pin the
     263              : // CTE emit shape end-to-end (`Transpile(stmt)` from a real
     264              : // `AnalyzeStatement` output) so a regression that changes the
     265              : // CTE-side anchor naming or the ref-scan-side rename surfaces as a
     266              : // string diff here. The CTE body projects each column to a
     267              : // positional anchor (`_cte_<idx>`) so per-reference name
     268              : // collisions across multiple `ResolvedWithRefScan`s cannot leak;
     269              : // `EmitWithRefScan` renames the anchor back to the analyzer's
     270              : // per-reference column names.
     271              : 
     272              : }  // namespace transpiler
     273              : }  // namespace duckdb
     274              : }  // namespace engine
     275              : }  // namespace backend
     276              : }  // namespace bigquery_emulator
        

Generated by: LCOV version 2.0-1