Line data Source code
1 : #include "backend/engine/duckdb/transpiler/transpiler_test_fixture.h"
2 :
3 : namespace bigquery_emulator {
4 : namespace backend {
5 : namespace engine {
6 : namespace duckdb {
7 : namespace transpiler {
8 :
9 1 : TEST_F(TranspilerTest, EmitSampleScanSystemPercentFromSurface) {
10 : // BigQuery `TABLESAMPLE SYSTEM (10 PERCENT)` lowers to a
11 : // `ResolvedSampleScan` whose `method=SYSTEM`, `unit=PERCENT`,
12 : // and `size=10`. DuckDB's `USING SAMPLE 10 PERCENT (system)`
13 : // matches the BQ semantics (block-level sampling at the chosen
14 : // percent).
15 1 : const ::googlesql::ResolvedStatement* stmt =
16 1 : Analyze("SELECT * FROM people TABLESAMPLE SYSTEM (10 PERCENT)");
17 1 : if (stmt == nullptr) {
18 0 : GTEST_SKIP() << "analyzer rejected TABLESAMPLE SYSTEM -- skip";
19 0 : }
20 1 : const ::googlesql::ResolvedScan* scan = QueryInputScan(stmt);
21 1 : if (scan == nullptr ||
22 1 : scan->node_kind() != ::googlesql::RESOLVED_SAMPLE_SCAN) {
23 0 : GTEST_SKIP() << "analyzer did not produce ResolvedSampleScan -- skip";
24 0 : }
25 1 : TestTranspiler t;
26 1 : EXPECT_EQ(t.EmitSampleScan(scan->GetAs<::googlesql::ResolvedSampleScan>()),
27 1 : "SELECT * FROM (SELECT \"id\", \"name\" FROM \"people\") "
28 1 : "USING SAMPLE 10 PERCENT (system)");
29 1 : }
30 :
31 : // Helper: synthesize a `ResolvedSampleScan` directly so each
32 : // emit-shape (method + unit + optional repeatable/weight/stratify)
33 : // can be exercised without driving the analyzer through the
34 : // (sometimes BQ-only) surface SQL forms. The input is a fresh
35 : // `ResolvedSingleRowScan` so the wrapped child scan always emits
36 : // `SELECT 1`. Callers transfer ownership of the size / repeatable
37 : // / weight / partition_by expressions through `std::move`. Returns
38 : // nullptr only when the size expression is missing (a malformed
39 : // SampleScan the analyzer would never produce).
40 : struct TestSampleScanArgs {
41 : std::string method;
42 : ::googlesql::ResolvedSampleScan::SampleUnit unit =
43 : ::googlesql::ResolvedSampleScan::PERCENT;
44 : std::unique_ptr<const ::googlesql::ResolvedExpr> size{};
45 : std::unique_ptr<const ::googlesql::ResolvedExpr> repeatable{};
46 : std::unique_ptr<const ::googlesql::ResolvedColumnHolder> weight{};
47 : std::vector<std::unique_ptr<const ::googlesql::ResolvedExpr>> partition_by{};
48 : };
49 : std::unique_ptr<::googlesql::ResolvedSampleScan> MakeTestSampleScan(
50 12 : TestSampleScanArgs args) {
51 12 : if (args.size == nullptr) return nullptr;
52 12 : return ::googlesql::MakeResolvedSampleScan(
53 12 : /*column_list=*/{},
54 12 : ::googlesql::MakeResolvedSingleRowScan(),
55 12 : args.method,
56 12 : std::move(args.size),
57 12 : args.unit,
58 12 : std::move(args.repeatable),
59 12 : std::move(args.weight),
60 12 : std::move(args.partition_by));
61 12 : }
62 :
63 1 : TEST_F(TranspilerTest, EmitSampleScanBernoulliPercentDirect) {
64 : // BERNOULLI sampling over PERCENT is the second DuckDB
65 : // method/unit combination the plan calls out. Direct
66 : // construction sidesteps any analyzer-surface variability around
67 : // method names other than SYSTEM. The expected SQL pins the
68 : // DuckDB shape `USING SAMPLE <n> PERCENT (bernoulli)`.
69 1 : TestSampleScanArgs args;
70 1 : args.method = "BERNOULLI";
71 1 : args.unit = ::googlesql::ResolvedSampleScan::PERCENT;
72 1 : args.size = ::googlesql::MakeResolvedLiteral(::googlesql::Value::Int64(25));
73 1 : auto sample = MakeTestSampleScan(std::move(args));
74 1 : ASSERT_NE(sample, nullptr);
75 1 : TestTranspiler t;
76 1 : EXPECT_EQ(t.EmitSampleScan(sample.get()),
77 1 : "SELECT * FROM (SELECT 1) USING SAMPLE 25 PERCENT (bernoulli)");
78 1 : }
79 :
80 1 : TEST_F(TranspilerTest, EmitSampleScanReservoirRowsDirect) {
81 : // RESERVOIR over ROWS. DuckDB picks reservoir sampling to hit an
82 : // exact row count, matching the BQ `RESERVOIR` semantics for
83 : // ROWS-shape sampling. We construct directly so the assertion
84 : // does not depend on the BQ surface accepting the `RESERVOIR
85 : // (50 ROWS)` form.
86 1 : TestSampleScanArgs args;
87 1 : args.method = "RESERVOIR";
88 1 : args.unit = ::googlesql::ResolvedSampleScan::ROWS;
89 1 : args.size = ::googlesql::MakeResolvedLiteral(::googlesql::Value::Int64(50));
90 1 : auto sample = MakeTestSampleScan(std::move(args));
91 1 : ASSERT_NE(sample, nullptr);
92 1 : TestTranspiler t;
93 1 : EXPECT_EQ(t.EmitSampleScan(sample.get()),
94 1 : "SELECT * FROM (SELECT 1) USING SAMPLE 50 ROWS (reservoir)");
95 1 : }
96 :
97 1 : TEST_F(TranspilerTest, EmitSampleScanSystemPercentDirect) {
98 : // SYSTEM over PERCENT through direct construction, mirroring the
99 : // surface-driven SYSTEM test so a future analyzer-side rewrite
100 : // of TABLESAMPLE leaves the direct-construction assertion as a
101 : // stable contract.
102 1 : TestSampleScanArgs args;
103 1 : args.method = "SYSTEM";
104 1 : args.unit = ::googlesql::ResolvedSampleScan::PERCENT;
105 1 : args.size = ::googlesql::MakeResolvedLiteral(::googlesql::Value::Int64(5));
106 1 : auto sample = MakeTestSampleScan(std::move(args));
107 1 : ASSERT_NE(sample, nullptr);
108 1 : TestTranspiler t;
109 1 : EXPECT_EQ(t.EmitSampleScan(sample.get()),
110 1 : "SELECT * FROM (SELECT 1) USING SAMPLE 5 PERCENT (system)");
111 1 : }
112 :
113 1 : TEST_F(TranspilerTest, EmitSampleScanReservoirPercentMismatchFallsBack) {
114 : // RESERVOIR with PERCENT does not have a clean DuckDB analog --
115 : // reservoir sampling targets a specific row count -- so we fall
116 : // back rather than emit `USING SAMPLE N PERCENT (reservoir)`,
117 : // which DuckDB rejects at parse time.
118 1 : TestSampleScanArgs args;
119 1 : args.method = "RESERVOIR";
120 1 : args.unit = ::googlesql::ResolvedSampleScan::PERCENT;
121 1 : args.size = ::googlesql::MakeResolvedLiteral(::googlesql::Value::Int64(10));
122 1 : auto sample = MakeTestSampleScan(std::move(args));
123 1 : ASSERT_NE(sample, nullptr);
124 1 : TestTranspiler t;
125 1 : EXPECT_EQ(t.EmitSampleScan(sample.get()), "");
126 1 : }
127 :
128 1 : TEST_F(TranspilerTest, EmitSampleScanSystemRowsMismatchFallsBack) {
129 : // SYSTEM with ROWS has no DuckDB equivalent (system sampling is
130 : // a percent-form block sampler). Bail so the engine surfaces
131 : // UNIMPLEMENTED for the whole query.
132 1 : TestSampleScanArgs args;
133 1 : args.method = "SYSTEM";
134 1 : args.unit = ::googlesql::ResolvedSampleScan::ROWS;
135 1 : args.size = ::googlesql::MakeResolvedLiteral(::googlesql::Value::Int64(100));
136 1 : auto sample = MakeTestSampleScan(std::move(args));
137 1 : ASSERT_NE(sample, nullptr);
138 1 : TestTranspiler t;
139 1 : EXPECT_EQ(t.EmitSampleScan(sample.get()), "");
140 1 : }
141 :
142 1 : TEST_F(TranspilerTest, EmitSampleScanUnknownMethodFallsBack) {
143 : // Methods outside the {SYSTEM, BERNOULLI, RESERVOIR} matrix do
144 : // not have a DuckDB analog. The emit falls back rather than
145 : // emitting `USING SAMPLE ... (other)`, which DuckDB rejects.
146 1 : TestSampleScanArgs args;
147 1 : args.method = "OTHER";
148 1 : args.unit = ::googlesql::ResolvedSampleScan::PERCENT;
149 1 : args.size = ::googlesql::MakeResolvedLiteral(::googlesql::Value::Int64(10));
150 1 : auto sample = MakeTestSampleScan(std::move(args));
151 1 : ASSERT_NE(sample, nullptr);
152 1 : TestTranspiler t;
153 1 : EXPECT_EQ(t.EmitSampleScan(sample.get()), "");
154 1 : }
155 :
156 1 : TEST_F(TranspilerTest, EmitSampleScanWithRepeatableSeed) {
157 : // DuckDB's `REPEATABLE (<seed>)` clause pins the PRNG for
158 : // deterministic sampling; the transpiler forwards the seed
159 : // expression verbatim.
160 1 : TestSampleScanArgs args;
161 1 : args.method = "SYSTEM";
162 1 : args.unit = ::googlesql::ResolvedSampleScan::PERCENT;
163 1 : args.size = ::googlesql::MakeResolvedLiteral(::googlesql::Value::Int64(10));
164 1 : args.repeatable =
165 1 : ::googlesql::MakeResolvedLiteral(::googlesql::Value::Int64(42));
166 1 : auto sample = MakeTestSampleScan(std::move(args));
167 1 : ASSERT_NE(sample, nullptr);
168 1 : TestTranspiler t;
169 1 : EXPECT_EQ(t.EmitSampleScan(sample.get()),
170 1 : "SELECT * FROM (SELECT 1) USING SAMPLE 10 PERCENT (system, 42)");
171 1 : }
172 :
173 1 : TEST_F(TranspilerTest, EmitSampleScanWithWeightColumnFallsBack) {
174 : // BigQuery `WITH WEIGHT <col>` lowers to a `weight_column` on
175 : // the SampleScan. DuckDB has no native weighted-sampling
176 : // keyword on `USING SAMPLE`, so we fall back. The test uses a
177 : // synthetic ResolvedColumn for the weight column so the
178 : // assertion does not depend on a particular surface that exposes
179 : // weighted sampling.
180 1 : TestSampleScanArgs args;
181 1 : args.method = "SYSTEM";
182 1 : args.unit = ::googlesql::ResolvedSampleScan::PERCENT;
183 1 : args.size = ::googlesql::MakeResolvedLiteral(::googlesql::Value::Int64(10));
184 1 : ::googlesql::ResolvedColumn weight_col(
185 1 : /*column_id=*/1,
186 1 : /*table_name=*/::googlesql::IdString::MakeGlobal("$sample"),
187 1 : /*name=*/::googlesql::IdString::MakeGlobal("w"),
188 1 : type_factory_->get_double());
189 1 : args.weight = ::googlesql::MakeResolvedColumnHolder(weight_col);
190 1 : auto sample = MakeTestSampleScan(std::move(args));
191 1 : ASSERT_NE(sample, nullptr);
192 1 : TestTranspiler t;
193 1 : EXPECT_EQ(t.EmitSampleScan(sample.get()), "");
194 1 : }
195 :
196 1 : TEST_F(TranspilerTest, EmitSampleScanWithStratifyFallsBack) {
197 : // BigQuery STRATIFY-BY surface populates `partition_by_list`.
198 : // DuckDB's `USING SAMPLE` has no per-partition sampling clause,
199 : // so we fall back. We push one stratify expression onto the
200 : // list (a literal so the fallback assertion is about the list
201 : // being non-empty, not about a sub-expression failure).
202 1 : TestSampleScanArgs args;
203 1 : args.method = "SYSTEM";
204 1 : args.unit = ::googlesql::ResolvedSampleScan::PERCENT;
205 1 : args.size = ::googlesql::MakeResolvedLiteral(::googlesql::Value::Int64(10));
206 1 : args.partition_by.push_back(
207 1 : ::googlesql::MakeResolvedLiteral(::googlesql::Value::Int64(1)));
208 1 : auto sample = MakeTestSampleScan(std::move(args));
209 1 : ASSERT_NE(sample, nullptr);
210 1 : TestTranspiler t;
211 1 : EXPECT_EQ(t.EmitSampleScan(sample.get()), "");
212 1 : }
213 :
214 1 : TEST_F(TranspilerTest, EmitSampleScanUnloweredSizeFallsBack) {
215 : // A size expression we cannot lower (an untyped parameter)
216 : // propagates "" through `EmitExpr`; the SampleScan emit must
217 : // then return "" rather than emit `USING SAMPLE PERCENT (...)`.
218 1 : TestSampleScanArgs args;
219 1 : args.method = "SYSTEM";
220 1 : args.unit = ::googlesql::ResolvedSampleScan::PERCENT;
221 1 : args.size = ::googlesql::MakeResolvedParameter(type_factory_->get_int64(),
222 1 : /*name=*/"n",
223 1 : /*position=*/0,
224 1 : /*is_untyped=*/true);
225 1 : auto sample = MakeTestSampleScan(std::move(args));
226 1 : ASSERT_NE(sample, nullptr);
227 1 : TestTranspiler t;
228 1 : EXPECT_EQ(t.EmitSampleScan(sample.get()), "");
229 1 : }
230 :
231 1 : TEST_F(TranspilerTest, EmitSampleScanPercentVsRowsContrast) {
232 : // Execution-style contrast: PERCENT and ROWS produce different
233 : // DuckDB shapes for the same numeric value. We assert on the
234 : // surface forms so a regression in the unit selector surfaces
235 : // here. Both methods are direct-construction so we can pin the
236 : // exact emit shape regardless of analyzer-side rewrites.
237 1 : TestSampleScanArgs percent_args;
238 1 : percent_args.method = "BERNOULLI";
239 1 : percent_args.unit = ::googlesql::ResolvedSampleScan::PERCENT;
240 1 : percent_args.size =
241 1 : ::googlesql::MakeResolvedLiteral(::googlesql::Value::Int64(10));
242 1 : auto percent_sample = MakeTestSampleScan(std::move(percent_args));
243 1 : ASSERT_NE(percent_sample, nullptr);
244 1 : TestSampleScanArgs rows_args;
245 1 : rows_args.method = "RESERVOIR";
246 1 : rows_args.unit = ::googlesql::ResolvedSampleScan::ROWS;
247 1 : rows_args.size =
248 1 : ::googlesql::MakeResolvedLiteral(::googlesql::Value::Int64(10));
249 1 : auto rows_sample = MakeTestSampleScan(std::move(rows_args));
250 1 : ASSERT_NE(rows_sample, nullptr);
251 1 : TestTranspiler t_percent;
252 1 : TestTranspiler t_rows;
253 1 : std::string percent_sql = t_percent.EmitSampleScan(percent_sample.get());
254 1 : std::string rows_sql = t_rows.EmitSampleScan(rows_sample.get());
255 1 : EXPECT_NE(percent_sql, rows_sql);
256 1 : EXPECT_NE(percent_sql.find(" PERCENT "), std::string::npos);
257 1 : EXPECT_NE(rows_sql.find(" ROWS "), std::string::npos);
258 1 : }
259 :
260 : // --- ResolvedWithScan / ResolvedWithRefScan ----------------------------
261 : //
262 : // `docs/ENGINE_POLICY.md` Family 1. These tests pin the
263 : // CTE emit shape end-to-end (`Transpile(stmt)` from a real
264 : // `AnalyzeStatement` output) so a regression that changes the
265 : // CTE-side anchor naming or the ref-scan-side rename surfaces as a
266 : // string diff here. The CTE body projects each column to a
267 : // positional anchor (`_cte_<idx>`) so per-reference name
268 : // collisions across multiple `ResolvedWithRefScan`s cannot leak;
269 : // `EmitWithRefScan` renames the anchor back to the analyzer's
270 : // per-reference column names.
271 :
272 : } // namespace transpiler
273 : } // namespace duckdb
274 : } // namespace engine
275 : } // namespace backend
276 : } // namespace bigquery_emulator
|