Line data Source code
1 : #ifndef BIGQUERY_EMULATOR_BACKEND_ENGINE_DUCKDB_TRANSPILER_TRANSPILER_H_
2 : #define BIGQUERY_EMULATOR_BACKEND_ENGINE_DUCKDB_TRANSPILER_TRANSPILER_H_
3 :
4 : // DuckDB transpiler.
5 : //
6 : // `Transpiler` walks a GoogleSQL `ResolvedAST` produced by
7 : // `googlesql::AnalyzeStatement` (see
8 : // `backend/engine/duckdb/duckdb_engine.cc`) and
9 : // emits a DuckDB-flavored SQL string the DuckDB C++ client can
10 : // execute. The visitor's emit set grows one shape at a time per the
11 : // `SHAPE_TRACKER.md` file in this directory, mirroring the
12 : // cloud-spanner-emulator shape-tracker approach.
13 : //
14 : // The class inherits from `googlesql::ResolvedASTVisitor` so future
15 : // plans can either override `VisitResolved*` directly (for nodes
16 : // that need to recurse into children with non-default ordering) or
17 : // route through the per-shape `Emit*` helpers declared here. The
18 : // SHAPE_TRACKER rows whose disposition lowers through DuckDB
19 : // (`duckdb_native` / `duckdb_rewrite` / `duckdb_udf`) correspond to
20 : // an `Emit*` that returns DuckDB-flavored SQL; rows on the other
21 : // dispositions (`semantic_executor` / `control_op` / `local_stub` /
22 : // `unsupported`) bottom out at the empty string and the route
23 : // classifier in `backend/engine/coordinator/` dispatches them to
24 : // the matching executor (or surfaces a BigQuery-shaped
25 : // `UNIMPLEMENTED` for `unsupported`).
26 : //
27 : // Implemented baseline (top-level SELECT, the `transpiler-select-core`
28 : // plan):
29 : //
30 : // * `EmitQueryStmt` lowers the root statement: it walks `query()` for
31 : // the relational tree, then renames each physical column through
32 : // the `output_column_list()` mapping so the user-visible alias
33 : // lands on the outermost SELECT.
34 : // * `EmitProjectScan` wraps `input_scan()` and projects the
35 : // `column_list()` schema, picking each column out of the matching
36 : // `expr_list()` entry (computed) or referencing it by name
37 : // (pass-through). When `expr_list()` is empty and `column_list()`
38 : // is a permutation of `input_scan()->column_list()` by column id,
39 : // the wrap is elided so the redundant
40 : // `SELECT * FROM (SELECT * FROM ...)` layer the analyzer always
41 : // inserts above a TableScan does not stack onto every scan emit.
42 : // * `EmitSingleRowScan` emits `SELECT 1` so a `ProjectScan` over
43 : // `SingleRowScan` (the analyzer's "no FROM clause" shape) composes
44 : // as a derived table.
45 : // * `EmitComputedColumn` lowers `column := expr` to
46 : // `<expr> AS "<resolved-column-name>"`.
47 : // * `EmitOutputColumn` renders the final alias mapping
48 : // (`<column-name> AS <output-name>`, collapsed when the names
49 : // match).
50 : //
51 : // Earlier plans (`transpiler-emit-scans` etc.) cover the other
52 : // `done` rows in `SHAPE_TRACKER.md` (TableScan, FilterScan, JoinScan,
53 : // AggregateScan, OrderByScan, LimitOffsetScan, AnalyticScan, ...).
54 : //
55 : // Set operations + sampling (`transpiler-setops-sample`) layer two
56 : // more scan kinds onto the dispatcher:
57 : //
58 : // * `EmitSetOperationScan` lowers `UNION ALL`, `UNION DISTINCT`,
59 : // `INTERSECT DISTINCT`, `EXCEPT DISTINCT` (and the DuckDB-native
60 : // `INTERSECT ALL` / `EXCEPT ALL` extensions that match standard
61 : // SQL bag semantics) by routing each `ResolvedSetOperationItem`
62 : // through a private `EmitSetOperationItem` helper. The helper
63 : // honors BigQuery's column-name propagation by projecting each
64 : // item's `output_column_list(i)` onto the parent's
65 : // `column_list(i)` name.
66 : // * `EmitSampleScan` lowers BigQuery `TABLESAMPLE` to DuckDB's
67 : // `USING SAMPLE` for the `SYSTEM` / `BERNOULLI` (PERCENT) and
68 : // `RESERVOIR` (ROWS) shapes whose semantics match. Weight columns,
69 : // stratify (`PARTITION BY`) lists, and repeatable seeds stay on
70 : // the empty-string fallback for now.
71 : //
72 : // Expression-core (`transpiler-expression-core`) layers four scalar
73 : // shapes on top of the baseline:
74 : //
75 : // * `EmitParameter` lowers `@<name>` and positional `?` parameters
76 : // to DuckDB `$N` placeholders and accumulates a `ParameterRef`
77 : // per unique slot in `parameter_order()` so the engine can copy
78 : // values across in bind order.
79 : // * `EmitCast` lowers `CAST(<expr> AS <type>)` (and `SAFE_CAST(...)`
80 : // -> `TRY_CAST(...)`) for every type kind that `types.h` exposes a
81 : // first-class DuckDB analog for. Format / time-zone / extended /
82 : // type-modifier shapes stay on the fallback.
83 : // * `EmitFunctionArgument` routes a `ResolvedFunctionArgument`
84 : // through the `expr()` slot it wraps; non-expression slots
85 : // (scan / model / lambda / ...) stay on the fallback.
86 : // * `EmitWithExpr` lowers `WITH(<a> AS <expr>, ...) <body>` to a
87 : // DuckDB scalar subquery that exposes each binding as a column on
88 : // an inner SELECT and then projects the body off it, preserving
89 : // GoogleSQL's once-per-row evaluation semantics.
90 : //
91 : // `Transpile` is the public entry point: it accepts the root
92 : // `ResolvedQueryStmt` (or any subtree) and returns the lowered
93 : // DuckDB SQL. `DuckDBEngine::ExecuteQuery` (in `duckdb_engine.cc`)
94 : // treats an empty transpilation as "not yet supported" and surfaces
95 : // UNIMPLEMENTED to the gateway through the engine's empty-string
96 : // contract.
97 : //
98 : // Threading: a `Transpiler` instance is *not* thread-safe — it
99 : // carries a per-traversal accumulator. The engine constructs a
100 : // fresh `Transpiler` per `ExecuteQuery` call so cross-query state
101 : // is not an issue.
102 :
103 : #include <string>
104 : #include <vector>
105 :
106 : #include "absl/container/flat_hash_map.h"
107 : #include "absl/strings/string_view.h"
108 : #include "googlesql/resolved_ast/resolved_ast.h"
109 : #include "googlesql/resolved_ast/resolved_ast_visitor.h"
110 : #include "googlesql/resolved_ast/resolved_node.h"
111 :
112 : namespace bigquery_emulator {
113 : namespace backend {
114 : namespace engine {
115 : namespace duckdb {
116 : namespace transpiler {
117 :
118 : class Transpiler : public ::googlesql::ResolvedASTVisitor {
119 : public:
120 : // Bind metadata for one DuckDB `$N` placeholder emitted by the
121 : // transpiler. The accumulator is exposed through
122 : // `parameter_order()` so the engine can copy GoogleSQL
123 : // `Value`s into DuckDB's bind buffer in placeholder-numerical
124 : // order.
125 : //
126 : // Exactly one of {`name`, `position`} carries the GoogleSQL
127 : // identity:
128 : // * Named parameter: `name` is the analyzer-lowercased
129 : // identifier (e.g. "@CustomerId" -> "customerid"); `position`
130 : // is 0.
131 : // * Positional parameter: `name` is empty; `position` is the
132 : // 1-based GoogleSQL position the analyzer assigned.
133 : // Either way the entry's index in `parameter_order()` is the
134 : // 0-based DuckDB placeholder slot (slot `i` is `$<i+1>`), so
135 : // multiple references to the same named parameter share one slot.
136 : struct ParameterRef {
137 : std::string name;
138 : int position = 0;
139 : };
140 :
141 : Transpiler();
142 : ~Transpiler() override;
143 :
144 : Transpiler(const Transpiler&) = delete;
145 : Transpiler& operator=(const Transpiler&) = delete;
146 :
147 : // Lower the resolved AST rooted at `node` (typically a
148 : // `ResolvedQueryStmt`) into a DuckDB SQL string.
149 : //
150 : // Returns the DuckDB SQL for any shape covered by the per-shape
151 : // `Emit*` methods below; an empty string means "transpiler cannot
152 : // lower this shape yet" and callers treat that as a signal for
153 : // the engine to surface UNIMPLEMENTED.
154 : //
155 : // Safe to call with a null `node`; returns the empty string in
156 : // that case.
157 : std::string Transpile(const ::googlesql::ResolvedNode* node);
158 :
159 : // Lower a relational `ResolvedScan` subtree to DuckDB SQL. Used by
160 : // CTAS and other control-op paths that already have a scan root.
161 : std::string TranspileScan(const ::googlesql::ResolvedScan* scan);
162 :
163 : // Lower CTAS / INSERT ... SELECT inner queries with the same
164 : // `query_output_column_names_` + outer projection path as
165 : // `EmitQueryStmt`, keyed on the analyzer's output column lists.
166 : std::string EmitCtasSelect(
167 : const ::googlesql::ResolvedCreateTableAsSelectStmt* stmt);
168 : std::string EmitInsertSelect(const ::googlesql::ResolvedInsertStmt* stmt);
169 :
170 : // Bind-order accumulator for the GoogleSQL parameters encountered
171 : // during the most recent traversal. Slot `i` (0-based) is the
172 : // parameter the transpiler emitted as `$<i+1>`. The engine reads
173 : // this back to copy values into DuckDB's bind buffer; named
174 : // parameters that appear multiple times in the SQL share a single
175 : // slot (see `EmitParameter`), so the vector contains one entry per
176 : // unique placeholder, not one per textual reference.
177 14 : const std::vector<ParameterRef>& parameter_order() const {
178 14 : return parameter_order_;
179 14 : }
180 :
181 : protected:
182 : // ---------------------------------------------------------------
183 : // Per-shape Emit hooks.
184 : //
185 : // Each hook returns the DuckDB SQL fragment for one ResolvedAST
186 : // node kind. Hooks for `duckdb_native` / `duckdb_rewrite` /
187 : // `duckdb_udf` rows in `SHAPE_TRACKER.md` emit DuckDB SQL; hooks
188 : // for the other dispositions (`semantic_executor` / `control_op`
189 : // / `local_stub` / `unsupported`) still return the empty string
190 : // and the route classifier dispatches the surrounding query to
191 : // the matching executor instead. Group them roughly by node
192 : // category so the file diff for "add scan emit" / "add expr emit"
193 : // stays focused.
194 : //
195 : // The list below is *not* exhaustive — it tracks the node kinds
196 : // for which DuckDB has a sensible analog. Node kinds whose
197 : // SHAPE_TRACKER disposition is owned by another executor (graph /
198 : // ML / pivot / DML / DDL / ...) get their disposition in the
199 : // tracker without an `Emit*` here; the visitor's `DefaultVisit`
200 : // falls through to "no DuckDB lowering" and the route classifier
201 : // routes the query elsewhere.
202 : // ---------------------------------------------------------------
203 :
204 : // Statements -----------------------------------------------------
205 : virtual std::string EmitQueryStmt(const ::googlesql::ResolvedQueryStmt* node);
206 :
207 : // Scans ----------------------------------------------------------
208 : virtual std::string EmitProjectScan(
209 : const ::googlesql::ResolvedProjectScan* node);
210 : virtual std::string EmitTableScan(const ::googlesql::ResolvedTableScan* node);
211 : virtual std::string EmitSingleRowScan(
212 : const ::googlesql::ResolvedSingleRowScan* node);
213 : virtual std::string EmitFilterScan(
214 : const ::googlesql::ResolvedFilterScan* node);
215 : virtual std::string EmitJoinScan(const ::googlesql::ResolvedJoinScan* node);
216 : virtual std::string EmitArrayScan(const ::googlesql::ResolvedArrayScan* node);
217 : virtual std::string EmitAggregateScan(
218 : const ::googlesql::ResolvedAggregateScan* node);
219 : virtual std::string EmitSetOperationScan(
220 : const ::googlesql::ResolvedSetOperationScan* node);
221 : virtual std::string EmitOrderByScan(
222 : const ::googlesql::ResolvedOrderByScan* node);
223 : virtual std::string EmitLimitOffsetScan(
224 : const ::googlesql::ResolvedLimitOffsetScan* node);
225 : virtual std::string EmitAnalyticScan(
226 : const ::googlesql::ResolvedAnalyticScan* node);
227 : virtual std::string EmitSampleScan(
228 : const ::googlesql::ResolvedSampleScan* node);
229 : virtual std::string EmitWithScan(const ::googlesql::ResolvedWithScan* node);
230 : virtual std::string EmitWithRefScan(
231 : const ::googlesql::ResolvedWithRefScan* node);
232 : // PIVOT / UNPIVOT lower through conditional-aggregation / UNION ALL
233 : // rewrites the BigQuery analyzer hands us as `ResolvedPivotScan`
234 : // and `ResolvedUnpivotScan`. See the per-shape comments on
235 : // `EmitPivotScan` and `EmitUnpivotScan` for the lowering rules.
236 : virtual std::string EmitPivotScan(const ::googlesql::ResolvedPivotScan* node);
237 : virtual std::string EmitUnpivotScan(
238 : const ::googlesql::ResolvedUnpivotScan* node);
239 : // Recursive CTE lowering. `EmitRecursiveScan` lowers the anchor +
240 : // recursive terms inside a `ResolvedWithEntry` whose
241 : // `with_subquery()` is a `ResolvedRecursiveScan` (i.e. the entry
242 : // is the recursive side of a `WITH RECURSIVE` block);
243 : // `EmitRecursiveRefScan` renames the per-CTE anchor names back to
244 : // the per-ref column names so the recursive arm's scans compose
245 : // like every other scan emit.
246 : virtual std::string EmitRecursiveScan(
247 : const ::googlesql::ResolvedRecursiveScan* node);
248 : virtual std::string EmitRecursiveRefScan(
249 : const ::googlesql::ResolvedRecursiveRefScan* node);
250 :
251 : // Expressions ----------------------------------------------------
252 : virtual std::string EmitLiteral(const ::googlesql::ResolvedLiteral* node);
253 : virtual std::string EmitParameter(const ::googlesql::ResolvedParameter* node);
254 : virtual std::string EmitColumnRef(const ::googlesql::ResolvedColumnRef* node);
255 : virtual std::string EmitFunctionCall(
256 : const ::googlesql::ResolvedFunctionCall* node);
257 : virtual std::string EmitAggregateFunctionCall(
258 : const ::googlesql::ResolvedAggregateFunctionCall* node);
259 : virtual std::string EmitAnalyticFunctionCall(
260 : const ::googlesql::ResolvedAnalyticFunctionCall* node);
261 : virtual std::string EmitCast(const ::googlesql::ResolvedCast* node);
262 : virtual std::string EmitMakeStruct(
263 : const ::googlesql::ResolvedMakeStruct* node);
264 : virtual std::string EmitGetStructField(
265 : const ::googlesql::ResolvedGetStructField* node);
266 : // BigQuery `<json>.<field>` lowers through DuckDB's `->` operator
267 : // when the result type is JSON (the common case for chained
268 : // `<json>.a.b` access) and through `->>` when the analyzer
269 : // resolves the return as a scalar (rare today; reserved for
270 : // analyzer-driven coercion). See the `EmitGetJsonField` body for
271 : // the deliberate choice between `->` / `->>` and
272 : // `json_extract` / `json_extract_string`.
273 : virtual std::string EmitGetJsonField(
274 : const ::googlesql::ResolvedGetJsonField* node);
275 : virtual std::string EmitSubqueryExpr(
276 : const ::googlesql::ResolvedSubqueryExpr* node);
277 : virtual std::string EmitWithExpr(const ::googlesql::ResolvedWithExpr* node);
278 :
279 : // Function-argument wrapper --------------------------------------
280 : // `ResolvedFunctionArgument` carries one of {expr, scan, model,
281 : // connection, descriptor, lambda, sequence, graph}. Today we
282 : // route through the `expr()` slot so callers that walk
283 : // `generic_argument_list` can treat a wrapped scalar argument as a
284 : // first-class expression. Every other slot stays on the
285 : // empty-string fallback (no caller proves the corresponding
286 : // function-argument syntax lowers cleanly to DuckDB yet).
287 : virtual std::string EmitFunctionArgument(
288 : const ::googlesql::ResolvedFunctionArgument* node);
289 :
290 : // Column / output shape ------------------------------------------
291 : virtual std::string EmitOutputColumn(
292 : const ::googlesql::ResolvedOutputColumn* node);
293 : virtual std::string EmitComputedColumn(
294 : const ::googlesql::ResolvedComputedColumn* node);
295 :
296 : private:
297 : // Dispatch a `ResolvedExpr` to the matching `Emit*` method based on
298 : // its `node_kind()`. Returns the empty string for any expression
299 : // kind whose `Emit*` still returns `""` -- the engine treats an
300 : // empty fragment the same way `Transpile()` does and surfaces
301 : // UNIMPLEMENTED to the gateway.
302 : std::string EmitExpr(const ::googlesql::ResolvedExpr* expr);
303 :
304 : // Dispatch a `ResolvedScan` to the matching `Emit*` method. Same
305 : // empty-string-as-fallback contract as `EmitExpr`.
306 : std::string EmitScan(const ::googlesql::ResolvedScan* scan);
307 :
308 : // Lower explicit `CROSS JOIN UNNEST(...) CROSS JOIN UNNEST(...)` shapes
309 : // the analyzer represents either as a multi-array `ResolvedArrayScan`
310 : // without `array_zip_mode`, or as a chain of nested single-array scans.
311 : std::string EmitUnnestCrossProductScan(
312 : const ::googlesql::ResolvedArrayScan* node);
313 :
314 : // Lower one `ResolvedWindowFrameExpr` (PRECEDING / CURRENT ROW /
315 : // FOLLOWING). Used by `EmitAnalyticScan` for the inner ROWS / RANGE
316 : // BETWEEN ... AND ... clause; returns the empty string when the
317 : // bound is malformed or when an offset bound carries an expression
318 : // we cannot lower yet, so the caller can propagate the empty-string
319 : // contract.
320 : std::string EmitFrameBound(const ::googlesql::ResolvedWindowFrameExpr* expr);
321 :
322 : // Per-clause helpers for `EmitAnalyticScan`. Each returns:
323 : // * `""` when the clause is legally absent (e.g. no PARTITION BY)
324 : // * `kAnalyticBail` when the shape is not yet lowerable and the
325 : // caller must abandon the analytic emit and let the engine
326 : // surface UNIMPLEMENTED.
327 : // * the SQL fragment otherwise.
328 : // The bail sentinel keeps the helpers' empty-string-as-fallback
329 : // contract from colliding with the legitimate "clause omitted"
330 : // value.
331 : static constexpr absl::string_view kAnalyticBail =
332 : "\x01"
333 : "bail";
334 : std::string BuildPartitionClause(
335 : const ::googlesql::ResolvedWindowPartitioning* p);
336 : std::string BuildOrderClause(const ::googlesql::ResolvedWindowOrdering* o);
337 : std::string BuildOrderClause(const ::googlesql::ResolvedWindowOrdering* o,
338 : bool bigquery_null_defaults,
339 : bool append_input_rn);
340 : std::string BuildFrameClause(const ::googlesql::ResolvedWindowFrame* wf);
341 : std::string BuildAnalyticProjection(
342 : const ::googlesql::ResolvedComputedColumnBase* col,
343 : absl::string_view partition_clause,
344 : absl::string_view order_clause);
345 : void CaptureAnalyticOutputOrder(
346 : const ::googlesql::ResolvedAnalyticFunctionGroup* group);
347 :
348 : // Lower one `ResolvedSetOperationItem` into a derived-table SELECT
349 : // that projects `parent->column_list()` in order. Each
350 : // `output_column_list[i]` names the column the item's child scan
351 : // exposes for the parent's `column_list(i)` slot, so we wrap the
352 : // child scan as `FROM (<scan>)` and project `"<output_i>" AS
353 : // "<parent_i>"` (collapsed when the names already match). Returns
354 : // the empty string when the child scan cannot lower or the
355 : // output/parent column counts disagree, so callers propagate the
356 : // empty-string fallback contract.
357 : std::string EmitSetOperationItem(
358 : const ::googlesql::ResolvedSetOperationItem* item,
359 : const ::googlesql::ResolvedSetOperationScan* parent);
360 :
361 : // Look up (or assign) the DuckDB `$N` slot for a named GoogleSQL
362 : // parameter. The first reference appends a new `ParameterRef` to
363 : // `parameter_order_`; subsequent references reuse the same slot
364 : // so a query like `WHERE @x AND @x` carries only one bind value.
365 : // The returned slot is 1-based to match DuckDB's `$1` notation.
366 : int LookupOrAssignNamedParameter(absl::string_view name);
367 :
368 : // Append a new positional `ParameterRef` and return its 1-based
369 : // slot. Positional GoogleSQL parameters are referentially distinct
370 : // every time they appear, so we never dedupe them.
371 : int AssignPositionalParameter(int analyzer_position);
372 :
373 : // Bind-order accumulator. Each entry is a `ParameterRef` whose
374 : // index `i` is the 0-based DuckDB placeholder slot (slot `i` is
375 : // `$<i+1>`). Named parameter lookups go through
376 : // `name_to_slot_` for dedup; positional ones always append.
377 : std::vector<ParameterRef> parameter_order_{};
378 : absl::flat_hash_map<std::string, int> name_to_slot_{};
379 :
380 : // Stack of `WITH RECURSIVE` CTE contexts. `EmitWithScan` pushes
381 : // one entry per recursive CTE before lowering its body; the
382 : // matching `EmitRecursiveRefScan` reads the back of the stack to
383 : // discover the DuckDB-side CTE name and the analyzer-stable
384 : // anchor column names the recursive arm references. Non-recursive
385 : // entries do not touch the stack so a non-recursive CTE nested
386 : // inside a recursive arm still sees the outer recursive context
387 : // through the stack.
388 : struct RecursiveCteContext {
389 : std::string cte_name;
390 : std::vector<std::string> column_names;
391 : };
392 : std::vector<RecursiveCteContext> recursive_cte_stack_{};
393 :
394 : // Set when the immediately-wrapped scan emit is a JOIN that projects
395 : // analyzer columns under `__bq_j_<column_id>` aliases.
396 : bool join_output_uses_id_aliases_ = false;
397 : // Sticky for the current query: any JOIN in the scan tree used id
398 : // aliases, so analytic-captured ORDER BY keys must remap by column_id.
399 : bool join_id_aliases_in_query_ = false;
400 : // True while the active scan subtree still exposes join output as
401 : // __bq_j_<column_id> (cleared when a computed ProjectScan renames).
402 : bool join_output_columns_use_id_aliases_ = false;
403 :
404 : // Set when the transpiled scan tree exposes `__bq_input_rn` (UNNEST
405 : // ordinality or an explicit row_number() stamp).
406 : bool input_has_rn_column_ = false;
407 : // Set when the final query needs ORDER BY using `__bq_input_rn` /
408 : // analytic output order (window queries only).
409 : bool input_rn_ordering_ = false;
410 : // Final ORDER BY keys derived from the first analytic group's
411 : // PARTITION BY / ORDER BY spec (BigQuery result ordering).
412 : std::vector<std::string> output_order_items_{};
413 : // Parallel `ResolvedColumn::column_id()` for each output_order_items_
414 : // entry; -1 when the key is not a simple column ref (expr / rn).
415 : std::vector<int> output_order_column_ids_{};
416 : // True when the user-visible SELECT list includes `__bq_input_rn`.
417 : bool output_includes_input_rn_ = false;
418 : // Suppresses appending `__bq_input_rn` to ProjectScan output when
419 : // lowering the inner scan of a subquery expression (IN/SCALAR/...).
420 : bool suppress_rn_in_project_ = false;
421 : // User-visible output column names from `ResolvedQueryStmt`; used to
422 : // decide whether PARTITION BY keys belong in the final ORDER BY.
423 : std::vector<std::string> query_output_column_names_{};
424 : };
425 :
426 : } // namespace transpiler
427 : } // namespace duckdb
428 : } // namespace engine
429 : } // namespace backend
430 : } // namespace bigquery_emulator
431 :
432 : #endif // BIGQUERY_EMULATOR_BACKEND_ENGINE_DUCKDB_TRANSPILER_TRANSPILER_H_
|