Line data Source code
1 : #ifndef BIGQUERY_EMULATOR_BACKEND_ENGINE_ENGINE_H_
2 : #define BIGQUERY_EMULATOR_BACKEND_ENGINE_ENGINE_H_
3 :
4 : // Engine is the C++ engine's query execution interface.
5 : //
6 : // The only implementation lives at `backend/engine/duckdb/`: it
7 : // transpiles the GoogleSQL ResolvedAST into DuckDB SQL via a custom
8 : // visitor and executes it through DuckDB's C++ client.
9 : //
10 : // This header defines the abstract surface only. The
11 : // `googlesql::Catalog` parameter is forward-declared so this header
12 : // stays free of any GoogleSQL include dependency. The `AnalyzedQuery`
13 : // and `RowSource` opaque interfaces let us return a resolved AST
14 : // handle and a streamed result without leaking engine-specific types
15 : // up to the gRPC handlers.
16 :
17 : #include <cstdint>
18 : #include <memory>
19 : #include <string>
20 : #include <utility>
21 : #include <vector>
22 :
23 : #include "absl/status/status.h"
24 : #include "absl/status/statusor.h"
25 : #include "absl/strings/string_view.h"
26 : #include "backend/engine/phase_recorder.h"
27 : #include "backend/schema/schema.h"
28 : #include "backend/storage/storage.h"
29 :
// Forward declaration only, so that this header does not pull in any
// GoogleSQL headers. The DuckDB engine downcasts the
// `googlesql::Catalog*` to its own catalog adapter when it actually
// runs analysis.
namespace googlesql {
class Catalog;
}  // namespace googlesql
36 :
37 : namespace bigquery_emulator {
38 : namespace backend {
39 : namespace engine {
40 :
// One BigQuery query parameter (named or positional).
//
// Plain aggregate of strings; every field defaults to empty.
struct QueryParameter {
  // Parameter name; empty for positional parameters.
  // NOTE(review): BigQuery GoogleSQL spells positional placeholders
  // `?`. This comment previously described them as `@0`, `@1`, ...;
  // confirm whether that spelling is an emulator-internal rewrite.
  std::string name;
  // GoogleSQL `TypeKind` name, e.g. "INT64", "STRING".
  std::string type_kind;
  // JSON-encoded literal value the gateway received on the REST
  // request; the engine round-trips it through GoogleSQL's literal
  // parser at analysis time.
  std::string value_json;
  // Gateway-encoded REST `parameterType` descriptor for STRUCT/ARRAY
  // parameters (field names and nested type kinds); empty for scalars.
  std::string type_json;
};
55 :
56 : // One query the engine is asked to plan or execute. The fields mirror
57 : // `bigquery_emulator.v1.QueryRequest` from `proto/emulator.proto`.
58 : struct QueryRequest {
59 : std::string project_id;
60 : // Default dataset for unqualified table references. May be empty.
61 : std::string default_dataset_id;
62 : std::string sql;
63 : std::vector<QueryParameter> parameters;
64 : // BigQuery defaults `useLegacySql` to true on the wire; the gateway
65 : // rejects that case (see the gateway-HTTP-surface section of
66 : // ROADMAP.md) so by the time a request reaches here this field
67 : // should always be false. We keep it as a
68 : // belt-and-braces field so the engine can also error out if the
69 : // gateway ever stops enforcing.
70 : bool use_legacy_sql = false;
71 : // Synthetic principal from the gateway; defaults to
72 : // catalog::kEmulatorPrincipalEmail when empty.
73 : std::string principal_email;
74 : // Optional per-query phase recorder populated by the frontend and
75 : // filled by coordinator / executor paths for loopback diagnostics.
76 : PhaseRecorderPtr phase_recorder;
77 : };
78 :
79 : // Opaque handle for a parsed + name-resolved query. The DuckDB engine
80 : // hides its own ResolvedAST plus any side state (extracted
81 : // parameters, default dataset, etc.) behind this interface.
82 : class AnalyzedQuery {
83 : public:
84 : virtual ~AnalyzedQuery();
85 :
86 : // The schema of the rows the query will produce on
87 : // `Engine::ExecuteQuery`.
88 : virtual const schema::TableSchema& output_schema() const = 0;
89 : };
90 :
91 : // Streamed query result. The engine produces rows one at a time;
92 : // `Next` returns false on end-of-stream. The DuckDB engine batches
93 : // internally and streams rows out one-by-one.
94 : class RowSource {
95 : public:
96 : virtual ~RowSource();
97 :
98 : virtual const schema::TableSchema& schema() const = 0;
99 :
100 : // Pulls the next row into `*row`. Returns:
101 : // * `true` - a row was written.
102 : // * `false` - end of stream; `*row` is unchanged.
103 : // A non-OK status indicates an execution error; further calls are
104 : // undefined.
105 : virtual absl::StatusOr<bool> Next(storage::Row* row) = 0;
106 : };
107 :
108 : // Result of a `DryRun`. Mirrors the BigQuery
109 : // `Job.statistics.query.{schema,totalBytesProcessed}` shape the
110 : // gateway exposes on `jobs.query?dryRun=true`.
111 : struct DryRunResult {
112 : schema::TableSchema schema;
113 : int64_t estimated_bytes_processed = 0;
114 : };
115 :
// Per-statement modification counts for an INSERT / UPDATE / DELETE /
// MERGE statement, reported by `Engine::ExecuteDml` through
// `DmlResult::stats`. Mirrors the BigQuery REST
// `Job.statistics.query.dmlStats` envelope; the frontend handler
// folds these counts into a final `QueryResultRow.dml_stats` message
// on the `Query.ExecuteQuery` stream.
//
// Every counter defaults to 0.
struct DmlStats {
  // Number of rows added by INSERT / MERGE-INSERT branches.
  int64_t inserted_row_count = 0;
  // Number of rows updated by UPDATE / MERGE-UPDATE branches.
  int64_t updated_row_count = 0;
  // Number of rows removed by DELETE / MERGE-DELETE branches.
  int64_t deleted_row_count = 0;
};
130 :
131 : // Result of `Engine::ExecuteDml` when the statement may carry a
132 : // `THEN RETURN` clause. `returning_rows` is non-null only when the
133 : // resolved AST includes `ResolvedReturningClause`; the frontend
134 : // streams its schema + rows before the trailing `dml_stats` message.
135 : struct DmlResult {
136 : DmlStats stats;
137 : std::unique_ptr<RowSource> returning_rows;
138 : };
139 :
140 : // Engine is the abstract interface every query backend implements.
141 : //
142 : // Lifetime: created once at startup with a `Storage*` and a
143 : // `googlesql::Catalog*` already wired up; shared by every gRPC
144 : // request handler. All methods are thread-safe.
145 : class Engine {
146 : public:
147 : virtual ~Engine();
148 :
149 : // Parse + name-resolve + type-check `request.sql` against `catalog`.
150 : // Returns an opaque `AnalyzedQuery` the caller can hand back to
151 : // `DryRun` / `ExecuteQuery`, OR a parse / analysis error mapped to
152 : // the matching absl::Status code (the gateway translates that into
153 : // a BigQuery error envelope; see the analyzer integration section
154 : // of ROADMAP.md).
155 : //
156 : // `[[nodiscard]]` is on every Status / StatusOr-returning method
157 : // here for the same reason it is on `backend::storage::Storage`:
158 : // dropping the result silently swallows a parse / analysis error
159 : // that the gateway has no other channel to surface.
160 : [[nodiscard]] virtual absl::StatusOr<std::unique_ptr<AnalyzedQuery>> Analyze(
161 : const QueryRequest& request, googlesql::Catalog* catalog) = 0;
162 :
163 : // Plan-only path used by `jobs.query?dryRun=true`. Implementations
164 : // are free to short-circuit through `Analyze` internally.
165 : [[nodiscard]] virtual absl::StatusOr<DryRunResult> DryRun(
166 : const QueryRequest& request, googlesql::Catalog* catalog) = 0;
167 :
168 : // Plan + execute. The returned `RowSource` streams the result rows
169 : // back to the gateway one by one; the gateway paginates them out
170 : // through the `bigquery.jobs.query` and
171 : // `bigquery.jobs.getQueryResults` REST endpoints.
172 : [[nodiscard]] virtual absl::StatusOr<std::unique_ptr<RowSource>> ExecuteQuery(
173 : const QueryRequest& request, googlesql::Catalog* catalog) = 0;
174 :
175 : // Plan + execute a DML statement (INSERT / UPDATE / DELETE / MERGE)
176 : // and return the per-statement modification counts. The engine is
177 : // expected to apply the changes to the underlying `Storage` it was
178 : // constructed with -- callers do not see the modified rows, only
179 : // the count summary the gateway folds into BigQuery's
180 : // `dmlStats` / `numDmlAffectedRows` fields. Engines that do not
181 : // implement DML yet return `absl::StatusCode::kUnimplemented`; the
182 : // frontend handler maps that to gRPC `UNIMPLEMENTED` so the
183 : // gateway can surface BigQuery's `notImplemented` reason.
184 : [[nodiscard]] virtual absl::StatusOr<DmlResult> ExecuteDml(
185 0 : const QueryRequest& request, googlesql::Catalog* catalog) {
186 0 : (void)request;
187 0 : (void)catalog;
188 0 : return absl::UnimplementedError(
189 0 : "Engine::ExecuteDml is not implemented in this engine");
190 0 : }
191 :
192 : // Plan + execute a DDL statement
193 : // (CREATE TABLE / CREATE TABLE AS SELECT / DROP TABLE / ALTER TABLE
194 : // ADD COLUMN). The engine mutates the underlying `Storage` -- there
195 : // is no row-shaped reply, just success (OK) or a status mapped to
196 : // the matching gRPC code. Engines that do not implement DDL return
197 : // `absl::StatusCode::kUnimplemented`; the frontend handler maps
198 : // that to gRPC `UNIMPLEMENTED`.
199 : [[nodiscard]] virtual absl::Status ExecuteDdl(const QueryRequest& request,
200 0 : googlesql::Catalog* catalog) {
201 0 : (void)request;
202 0 : (void)catalog;
203 0 : return absl::UnimplementedError(
204 0 : "Engine::ExecuteDdl is not implemented in this engine");
205 0 : }
206 : };
207 :
208 : } // namespace engine
209 : } // namespace backend
210 : } // namespace bigquery_emulator
211 :
212 : #endif // BIGQUERY_EMULATOR_BACKEND_ENGINE_ENGINE_H_
|