Line data Source code
1 : #ifndef BIGQUERY_EMULATOR_BACKEND_ENGINE_DUCKDB_TRANSPILER_TRANSPILER_INTERNAL_H_
2 : #define BIGQUERY_EMULATOR_BACKEND_ENGINE_DUCKDB_TRANSPILER_TRANSPILER_INTERNAL_H_
3 :
4 : // Shared helpers for DuckDB transpiler emit translation units.
5 :
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <optional>
#include <string>
#include <utility>
#include <vector>

#include "absl/container/flat_hash_map.h"
#include "absl/container/flat_hash_set.h"
#include "absl/strings/ascii.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/strings/str_join.h"
#include "absl/strings/str_replace.h"
#include "absl/strings/string_view.h"
#include "absl/time/time.h"
#include "googlesql/public/type.h"
#include "googlesql/public/types/struct_type.h"
#include "googlesql/public/value.h"
#include "googlesql/resolved_ast/resolved_ast.h"
#include "googlesql/resolved_ast/resolved_node.h"
24 :
25 : namespace bigquery_emulator {
26 : namespace backend {
27 : namespace engine {
28 : namespace duckdb {
29 : namespace transpiler {
30 : namespace internal {
31 :
32 : // Double-quote escape a DuckDB identifier. DuckDB doubles embedded
33 : // `"` characters; we do the same so column / table names with quotes
34 : // or hyphens round-trip safely through the emitted SQL.
35 2843 : inline std::string QuoteIdent(absl::string_view name) {
36 2843 : return absl::StrCat("\"", absl::StrReplaceAll(name, {{"\"", "\"\""}}), "\"");
37 2843 : }
38 :
39 : // Single-quote escape a DuckDB string literal. DuckDB doubles embedded
40 : // `'` characters; we do the same so BQ string literals with embedded
41 : // apostrophes round-trip safely. Used for both ResolvedLiteral
42 : // strings and for STRUCT field-name keys in `{'k': v}` literals.
43 65 : inline std::string QuoteString(absl::string_view text) {
44 65 : return absl::StrCat("'", absl::StrReplaceAll(text, {{"'", "''"}}), "'");
45 65 : }
46 :
47 : inline std::string ResolveFunctionName(const ::googlesql::Function* fn) {
48 : if (fn == nullptr) return "";
49 : return absl::AsciiStrToLower(fn->FullName(/*include_group=*/false));
50 : }
51 :
52 : inline std::string WrapArrayAggRespectNulls(absl::string_view body,
53 : absl::string_view arg) {
54 : return absl::StrCat(
55 : "if(count(",
56 : arg,
57 : ") < count(*), error('ARRAY_AGG: input value must be not null'), ",
58 : body,
59 : ")");
60 : }
61 :
62 : inline constexpr const char kBqInputRnCol[] = "__bq_input_rn";
63 :
64 : inline std::string JoinColumnIdAlias(int column_id) {
65 : return QuoteIdent(absl::StrCat("__bq_j_", column_id));
66 : }
67 : inline constexpr const char kBqPctCoalesceCol[] = "__bq_pct_coalesce";
68 : inline constexpr char kBqPctNullSentinel[] = "'!__BQ_NULL__!'";
69 : inline constexpr const char kBqUnionOrdCol[] = "__bq_union_ord";
70 :
71 : // Suffix for ORDER BY direction + NULL ordering.
72 : inline std::string OrderByItemSuffix(
73 : const ::googlesql::ResolvedOrderByItem* item,
74 0 : bool bigquery_null_defaults = false) {
75 0 : const char* dir = item->is_descending() ? "DESC" : "ASC";
76 0 : const char* nulls = "";
77 0 : switch (item->null_order()) {
78 0 : case ::googlesql::ResolvedOrderByItem::NULLS_FIRST:
79 0 : nulls = " NULLS FIRST";
80 0 : break;
81 0 : case ::googlesql::ResolvedOrderByItem::NULLS_LAST:
82 0 : nulls = " NULLS LAST";
83 0 : break;
84 0 : case ::googlesql::ResolvedOrderByItem::ORDER_UNSPECIFIED:
85 0 : default:
86 0 : if (bigquery_null_defaults) {
87 0 : nulls = item->is_descending() ? " NULLS LAST" : " NULLS FIRST";
88 0 : }
89 0 : break;
90 0 : }
91 0 : return absl::StrCat(" ", dir, nulls);
92 0 : }
93 :
94 : inline std::optional<std::string> TryLiteralString(
95 0 : const ::googlesql::ResolvedExpr* expr) {
96 0 : if (expr == nullptr || expr->node_kind() != ::googlesql::RESOLVED_LITERAL) {
97 0 : return std::nullopt;
98 0 : }
99 0 : const auto* lit = expr->GetAs<::googlesql::ResolvedLiteral>();
100 0 : if (lit == nullptr) return std::nullopt;
101 0 : const ::googlesql::Value& v = lit->value();
102 0 : if (v.is_null() || v.type_kind() != ::googlesql::TYPE_STRING) {
103 0 : return std::nullopt;
104 0 : }
105 0 : return v.string_value();
106 0 : }
107 :
108 : // DuckDB BLOB literals use per-byte `\xHH` escapes inside single quotes
109 : // (`'\x61\x62\x63'::BLOB`). A bare hex digit run (`'616263'::BLOB`) is
110 : // six ASCII bytes, not three decoded bytes.
111 0 : inline std::string EmitBlobLiteral(absl::string_view bytes) {
112 0 : static const char kHex[] = "0123456789abcdef";
113 0 : std::string escaped;
114 0 : escaped.reserve(bytes.size() * 4);
115 0 : for (unsigned char c : bytes) {
116 0 : escaped.push_back('\\');
117 0 : escaped.push_back('x');
118 0 : escaped.push_back(kHex[c >> 4]);
119 0 : escaped.push_back(kHex[c & 0xf]);
120 0 : }
121 0 : return absl::StrCat("'", escaped, "'::BLOB");
122 0 : }
123 :
124 : // Synthesize a stable DuckDB-side field name for a BigQuery STRUCT
125 : // field that was declared without one (e.g. `STRUCT(1, 'a')`). DuckDB
126 : // requires every struct field to be named, so we pick a positional
127 : // scheme (`_0`, `_1`, ...) and use the *same* convention everywhere
128 : // the transpiler emits SQL that mentions the field:
129 : //
130 : // * `EmitValueLiteral` (folded constant struct) and `EmitMakeStruct`
131 : // emit the synthesized name as the key in `{'_<i>': <value>}`.
132 : // * `EmitGetStructField` resolves a positional access to the same
133 : // synthesized name on the dotted form (`<expr>."_<i>"`).
134 : //
135 : // Stable, monotonic positional names match BigQuery's positional
136 : // field-order semantics one-for-one and keep the conformance harness
137 : // from having to round-trip the BQ-side name (which is empty
138 : // regardless of how the user spelled the access).
139 2 : inline std::string SynthesizeAnonymousFieldName(int idx) {
140 2 : return absl::StrCat("_", idx);
141 2 : }
142 :
143 : // Pick the DuckDB field name to use for STRUCT field `idx` of type
144 : // `st`. Returns the analyzer's name when set, or the synthesized
145 : // positional name (`_<idx>`) for an anonymous field. Centralizing the
146 : // choice keeps `EmitValueLiteral`, `EmitMakeStruct`, and
147 : // `EmitGetStructField` aligned -- a drift between the literal/maker
148 : // emit and the field-access emit would silently produce DuckDB
149 : // "field does not exist" runtime errors.
150 : inline std::string ResolveStructFieldName(const ::googlesql::StructType& st,
151 12 : int idx) {
152 12 : const ::googlesql::StructField& f = st.field(idx);
153 12 : if (f.name.empty()) return SynthesizeAnonymousFieldName(idx);
154 10 : return f.name;
155 12 : }
156 :
// Formats a day count relative to 1970-01-01 as a proleptic-Gregorian
// `YYYY-MM-DD` string (year zero-padded to at least four digits).
//
// The conversion is the usual "civil from days" computation: the epoch
// is shifted to 0000-03-01 so leap days fall at the end of each
// March-based year, the day count is split into 400-year eras, and the
// year / month / day are recovered from the day-of-era. The epoch shift
// is carried out in 64-bit arithmetic so that inputs close to INT32_MAX
// cannot overflow a signed 32-bit intermediate.
inline std::string FormatDateLiteral(int32_t days_since_epoch) {
  const int64_t shifted = static_cast<int64_t>(days_since_epoch) + 719468;
  // Floor division by the 146097-day era length, also for negatives.
  const int64_t era = (shifted >= 0 ? shifted : shifted - 146096) / 146097;
  const unsigned day_of_era =
      static_cast<unsigned>(shifted - era * 146097);  // [0, 146096]
  const unsigned year_of_era =
      (day_of_era - day_of_era / 1460 + day_of_era / 36524 -
       day_of_era / 146096) /
      365;  // [0, 399]
  const unsigned day_of_year =
      day_of_era -
      (365 * year_of_era + year_of_era / 4 - year_of_era / 100);  // [0, 365]
  const unsigned march_month = (5 * day_of_year + 2) / 153;       // [0, 11]
  const unsigned day = day_of_year - (153 * march_month + 2) / 5 + 1;
  const unsigned month = march_month < 10 ? march_month + 3 : march_month - 9;
  long long year =
      static_cast<long long>(year_of_era) + static_cast<long long>(era) * 400;
  // January and February belong to the following civil year.
  if (month <= 2) ++year;

  char buffer[32];
  std::snprintf(buffer, sizeof(buffer), "%04lld-%02u-%02u", year, month, day);
  return std::string(buffer);
}

// Lower a GoogleSQL `Value` into a DuckDB SQL literal expression.
//
// Scalars route through `Value::GetSQLLiteral(PRODUCT_EXTERNAL)`
// because that path already matches DuckDB syntax for INT / FLOAT /
// BOOL / NUMERIC etc. Strings, arrays, and structs each need a bespoke
// shape:
//
//   * Strings: DuckDB reads double-quoted text as an *identifier*, so we
//     emit the single-quoted form (`'hi'`).
//   * Arrays: DuckDB's array literal is `[e1, e2, ...]`, same shape as
//     GoogleSQL's `kSQLLiteral` output, but we recurse so nested
//     STRINGs / STRUCTs get the DuckDB-flavored quoting above instead
//     of GoogleSQL's `"..."` and `(...)` shapes.
//   * Structs: DuckDB struct literals are `{'k1': v1, 'k2': v2, ...}`
//     keyed by name. BQ STRUCT field order is positional (the type
//     carries the names), so we walk the StructType for the keys in
//     parallel with the value list. Anonymous BigQuery fields (empty
//     name) get a synthesized positional name (`_0`, `_1`, ...) via
//     `ResolveStructFieldName` so the literal emits as
//     `{'_0': 1, '_1': 'a'}`; `EmitGetStructField` uses the same
//     convention on the access side.
192 :
193 : // Returns the empty string when any element / field cannot be lowered;
194 : // callers propagate that up so the engine fallback fires per the
195 : // per-shape disposition in SHAPE_TRACKER.md.
196 : inline std::string EmitValueLiteral(const ::googlesql::Value& v) {
197 : if (v.is_null()) return "NULL";
198 : const ::googlesql::Type* type = v.type();
199 : if (type == nullptr) return "";
200 : switch (type->kind()) {
201 : case ::googlesql::TYPE_STRING:
202 : return QuoteString(v.string_value());
203 : case ::googlesql::TYPE_BYTES:
204 : return EmitBlobLiteral(v.bytes_value());
205 : case ::googlesql::TYPE_ARRAY: {
206 : std::vector<std::string> elems;
207 : elems.reserve(v.num_elements());
208 : for (int i = 0; i < v.num_elements(); ++i) {
209 : std::string e = EmitValueLiteral(v.element(i));
210 : if (e.empty()) return "";
211 : elems.push_back(std::move(e));
212 : }
213 : return absl::StrCat("[", absl::StrJoin(elems, ", "), "]");
214 : }
215 : case ::googlesql::TYPE_STRUCT: {
216 : const ::googlesql::StructType* st = type->AsStruct();
217 : if (st == nullptr || st->num_fields() != v.num_fields()) return "";
218 : std::vector<std::string> kvs;
219 : kvs.reserve(v.num_fields());
220 : for (int i = 0; i < v.num_fields(); ++i) {
221 : std::string fv = EmitValueLiteral(v.field(i));
222 : if (fv.empty()) return "";
223 : kvs.push_back(absl::StrCat(
224 : QuoteString(ResolveStructFieldName(*st, i)), ": ", fv));
225 : }
226 : return absl::StrCat("{", absl::StrJoin(kvs, ", "), "}");
227 : }
228 : case ::googlesql::TYPE_TIMESTAMP: {
229 : const absl::TimeZone utc = absl::UTCTimeZone();
230 : const absl::Time t = v.ToTime();
231 : const int64_t micros = absl::ToUnixMicros(t);
232 : std::string formatted;
233 : if (micros % 1000000 == 0) {
234 : formatted = absl::FormatTime("%Y-%m-%d %H:%M:%S+00", t, utc);
235 : } else {
236 : formatted = absl::StrCat(
237 : absl::FormatTime("%Y-%m-%d %H:%M:%E6S", t, utc), "+00");
238 : }
239 : return absl::StrCat("CAST(", QuoteString(formatted), " AS TIMESTAMPTZ)");
240 : }
241 : case ::googlesql::TYPE_DATETIME: {
242 : std::string out = v.datetime_value().DebugString();
243 : const size_t sep = out.find(' ');
244 : if (sep != std::string::npos) {
245 : out[sep] = 'T';
246 : }
247 : return absl::StrCat("CAST(", QuoteString(out), " AS TIMESTAMP)");
248 : }
249 : case ::googlesql::TYPE_DATE:
250 : return absl::StrCat(
251 : "CAST(", QuoteString(FormatDateLiteral(v.date_value())), " AS DATE)");
252 : case ::googlesql::TYPE_TIME:
253 : return absl::StrCat(
254 : "CAST(", QuoteString(v.time_value().DebugString()), " AS TIME)");
255 : default:
256 : return v.GetSQLLiteral(::googlesql::PRODUCT_EXTERNAL);
257 : }
258 : }
259 :
260 : // BigQuery STRUCT-to-STRUCT casts match fields by positional index;
261 : // DuckDB `CAST(... AS STRUCT(...))` requires overlapping field names.
262 : // Remap each target field from the source struct via dotted access and
263 : // emit a DuckDB struct literal keyed by the target names.
264 : inline std::string EmitStructPositionalCastRemap(
265 : absl::string_view inner,
266 : const ::googlesql::StructType& source_st,
267 1 : const ::googlesql::StructType& target_st) {
268 1 : if (source_st.num_fields() != target_st.num_fields()) return "";
269 1 : std::vector<std::string> kvs;
270 1 : kvs.reserve(target_st.num_fields());
271 1 : const std::string wrapped = absl::StrCat("(", inner, ")");
272 3 : for (int i = 0; i < target_st.num_fields(); ++i) {
273 2 : const std::string source_field = ResolveStructFieldName(source_st, i);
274 2 : const std::string target_field = ResolveStructFieldName(target_st, i);
275 2 : kvs.push_back(absl::StrCat(QuoteString(target_field),
276 2 : ": ",
277 2 : wrapped,
278 2 : ".",
279 2 : QuoteIdent(source_field)));
280 2 : }
281 1 : return absl::StrCat("{", absl::StrJoin(kvs, ", "), "}");
282 1 : }
283 :
284 : // Whitelist of GoogleSQL `TypeKind`s the `EmitCast` path will lower.
285 : // `DuckDBSqlTypeName` itself is intentionally total (it falls through
286 : // to `VARCHAR` for unsupported kinds so column-def emit always
287 : // compiles), but for `CAST(<expr> AS T)` we'd rather take the engine
288 : // fallback than silently retype `GEOGRAPHY` / proto / enum / range /
289 : // graph values to a DuckDB string -- the runtime semantics would not
290 : // match the BigQuery cast contract.
291 : inline bool IsCastTargetSupported(::googlesql::TypeKind kind) {
292 : switch (kind) {
293 : case ::googlesql::TYPE_BOOL:
294 : case ::googlesql::TYPE_INT32:
295 : case ::googlesql::TYPE_INT64:
296 : case ::googlesql::TYPE_UINT32:
297 : case ::googlesql::TYPE_UINT64:
298 : case ::googlesql::TYPE_FLOAT:
299 : case ::googlesql::TYPE_DOUBLE:
300 : case ::googlesql::TYPE_STRING:
301 : case ::googlesql::TYPE_BYTES:
302 : case ::googlesql::TYPE_DATE:
303 : case ::googlesql::TYPE_TIME:
304 : case ::googlesql::TYPE_DATETIME:
305 : case ::googlesql::TYPE_TIMESTAMP:
306 : case ::googlesql::TYPE_NUMERIC:
307 : case ::googlesql::TYPE_BIGNUMERIC:
308 : case ::googlesql::TYPE_JSON:
309 : case ::googlesql::TYPE_INTERVAL:
310 : case ::googlesql::TYPE_UUID:
311 : case ::googlesql::TYPE_ARRAY:
312 : case ::googlesql::TYPE_STRUCT:
313 : return true;
314 : default:
315 : return false;
316 : }
317 : }
318 :
// Returns the double-quoted identifier that `item` starts with,
// including both surrounding quotes, or the empty string when `item`
// does not begin with `"` or the identifier is never closed.
//
// A doubled `""` inside the identifier is the escape for an embedded
// quote (the form `QuoteIdent` emits), so it is skipped rather than
// treated as the closing quote: `"a""b" DESC` yields `"a""b"`.
inline std::string OrderItemLeadingColumn(const std::string& item) {
  if (item.empty() || item.front() != '"') return "";
  const size_t length = item.size();
  for (size_t pos = 1; pos < length; ++pos) {
    if (item[pos] != '"') continue;
    if (pos + 1 < length && item[pos + 1] == '"') {
      ++pos;  // Escaped quote: step over the second `"` as well.
      continue;
    }
    return item.substr(0, pos + 1);
  }
  return "";
}
325 :
326 : inline std::string RemapOrderItemForJoinAliases(const std::string& item,
327 : int column_id,
328 : bool join_id_aliases) {
329 : if (!join_id_aliases || column_id < 0) return item;
330 : const std::string leading = OrderItemLeadingColumn(item);
331 : if (leading.empty()) return item;
332 : return absl::StrCat(JoinColumnIdAlias(column_id),
333 : item.substr(leading.size()));
334 : }
335 :
336 : inline std::string OrderColumnExprForWrap(const std::string& quoted_name,
337 : int column_id,
338 : bool join_id_aliases) {
339 : if (join_id_aliases && column_id >= 0) {
340 : return JoinColumnIdAlias(column_id);
341 : }
342 : return quoted_name;
343 : }
344 :
345 : inline bool OutputListContainsColumn(
346 : absl::string_view quoted_col, const ::googlesql::ResolvedQueryStmt* node) {
347 : if (node == nullptr) return false;
348 : for (int i = 0; i < node->output_column_list_size(); ++i) {
349 : const ::googlesql::ResolvedOutputColumn* out = node->output_column_list(i);
350 : if (out == nullptr) continue;
351 : if (quoted_col == QuoteIdent(out->column().name())) return true;
352 : }
353 : return false;
354 : }
355 :
356 : inline std::vector<std::string> ExtraOrderColumnsForWrap(
357 : const std::vector<std::string>& order_items,
358 : const ::googlesql::ResolvedQueryStmt* node,
359 : const std::vector<int>* order_column_ids = nullptr,
360 0 : bool join_id_aliases = false) {
361 0 : std::vector<std::string> extra;
362 0 : for (size_t i = 0; i < order_items.size(); ++i) {
363 0 : const std::string& item = order_items[i];
364 0 : const std::string col = OrderItemLeadingColumn(item);
365 0 : if (col.empty() || col == QuoteIdent(kBqInputRnCol)) continue;
366 0 : if (OutputListContainsColumn(col, node)) continue;
367 0 : const int column_id =
368 0 : order_column_ids != nullptr && i < order_column_ids->size()
369 0 : ? (*order_column_ids)[i]
370 0 : : -1;
371 0 : const std::string proj =
372 0 : OrderColumnExprForWrap(col, column_id, join_id_aliases);
373 0 : if (std::find(extra.begin(), extra.end(), proj) == extra.end()) {
374 0 : extra.push_back(proj);
375 0 : }
376 0 : }
377 0 : return extra;
378 0 : }
379 :
380 : // Drop implicit ORDER BY keys that are not projected by a column-reducing
381 : // scan (e.g. SELECT DISTINCT / GROUP BY after an analytic dedup window).
382 : inline void FilterOutputOrderItemsByProjectedColumns(
383 : std::vector<std::string>* order_items,
384 : std::vector<int>* order_column_ids,
385 : const absl::flat_hash_set<std::string>& projected_quoted_names) {
386 : if (order_items == nullptr || order_items->empty()) return;
387 : std::vector<std::string> filtered;
388 : std::vector<int> filtered_ids;
389 : filtered.reserve(order_items->size());
390 : filtered_ids.reserve(order_items->size());
391 : for (size_t i = 0; i < order_items->size(); ++i) {
392 : const std::string col = OrderItemLeadingColumn((*order_items)[i]);
393 : if (col.empty()) continue;
394 : if (projected_quoted_names.contains(col)) {
395 : filtered.push_back((*order_items)[i]);
396 : filtered_ids.push_back(order_column_ids != nullptr &&
397 : i < order_column_ids->size()
398 : ? (*order_column_ids)[i]
399 : : -1);
400 : }
401 : }
402 : *order_items = std::move(filtered);
403 : if (order_column_ids != nullptr) {
404 : *order_column_ids = std::move(filtered_ids);
405 : }
406 : }
407 :
// `CASE val WHEN w1 THEN t1 ... ELSE e END` for analyzer `$case_with_value`.
//
// `args` is laid out as `[val, w1, t1, ..., wn, tn, else]`, so it must
// have an even size of at least four (the value, one WHEN/THEN pair and
// the ELSE). Any other shape returns the empty string; in particular a
// two-element list is rejected because `CASE val ELSE e END` has no WHEN
// arm and is not valid SQL.
inline std::string EmitCaseWithValue(const std::vector<std::string>& args) {
  if (args.size() < 4 || (args.size() % 2) != 0) return "";
  std::string sql = "CASE ";
  sql += args.front();
  sql += ' ';
  // Pairs occupy indices [1, size - 2]; the last element is the ELSE.
  for (size_t i = 1; i + 1 < args.size(); i += 2) {
    sql += "WHEN ";
    sql += args[i];
    sql += " THEN ";
    sql += args[i + 1];
    sql += ' ';
  }
  sql += "ELSE ";
  sql += args.back();
  sql += " END";
  return sql;
}
418 :
// `CASE WHEN c1 THEN t1 ... ELSE e END` for analyzer `$case_no_value`.
//
// `args` is laid out as `[c1, t1, ..., cn, tn, else]`, so it must have
// an odd size of at least three (one WHEN/THEN pair and the ELSE). Any
// other shape returns the empty string; in particular a single-element
// list is rejected because `CASE ELSE e END` has no WHEN arm and is not
// valid SQL.
inline std::string EmitCaseNoValue(const std::vector<std::string>& args) {
  if (args.size() < 3 || (args.size() % 2) == 0) return "";
  std::string sql = "CASE ";
  // Pairs occupy indices [0, size - 2]; the last element is the ELSE.
  for (size_t i = 0; i + 1 < args.size(); i += 2) {
    sql += "WHEN ";
    sql += args[i];
    sql += " THEN ";
    sql += args[i + 1];
    sql += ' ';
  }
  sql += "ELSE ";
  sql += args.back();
  sql += " END";
  return sql;
}
429 :
430 : inline bool SupportsOrderedAggregateModifiers(absl::string_view name) {
431 : return name == "array_agg" || name == "string_agg" ||
432 : name == "array_concat_agg";
433 : }
434 :
435 : inline bool SupportsAnalyticNullHandling(absl::string_view name) {
436 : return name == "first_value" || name == "last_value" || name == "nth_value";
437 : }
438 :
439 : inline std::string AnalyticNullHandlingSuffix(
440 : const ::googlesql::ResolvedAnalyticFunctionCall* node) {
441 : if (node == nullptr) return "";
442 : switch (node->null_handling_modifier()) {
443 : case ::googlesql::ResolvedNonScalarFunctionCallBase::IGNORE_NULLS:
444 : return " IGNORE NULLS";
445 : case ::googlesql::ResolvedNonScalarFunctionCallBase::RESPECT_NULLS:
446 : return " RESPECT NULLS";
447 : default:
448 : return "";
449 : }
450 : }
451 :
452 : inline std::string AppendArrayAggNullFilter(absl::string_view body,
453 : absl::string_view arg,
454 : bool ignore_nulls) {
455 : if (!ignore_nulls) return std::string(body);
456 : return absl::StrCat(body, " FILTER (WHERE ", arg, " IS NOT NULL)");
457 : }
458 :
459 : bool ScanTreeContainsAnalytic(const ::googlesql::ResolvedScan* scan);
460 : bool AnalyticOrderNeedsInputRn(
461 : const ::googlesql::ResolvedAnalyticFunctionGroup* group);
462 : bool AnalyticGroupNeedsInputRnForEmptyOrder(
463 : const ::googlesql::ResolvedAnalyticFunctionGroup* group);
464 : bool AnalyticGroupHasRangeFrame(
465 : const ::googlesql::ResolvedAnalyticFunctionGroup* group);
466 : bool AggregateScanNeedsInputRn(const ::googlesql::ResolvedAggregateScan* node);
467 : } // namespace internal
468 : } // namespace transpiler
469 : } // namespace duckdb
470 : } // namespace engine
471 : } // namespace backend
472 : } // namespace bigquery_emulator
473 :
474 : #endif // BIGQUERY_EMULATOR_BACKEND_ENGINE_DUCKDB_TRANSPILER_TRANSPILER_INTERNAL_H_
|