Line data Source code
1 : #ifndef BIGQUERY_EMULATOR_BACKEND_CATALOG_GOOGLESQL_CATALOG_H_
2 : #define BIGQUERY_EMULATOR_BACKEND_CATALOG_GOOGLESQL_CATALOG_H_
3 :
4 : // GoogleSqlCatalog is the GoogleSQL-facing catalog that the analyzer
5 : // consults during name resolution. It is a thin
6 : // adapter: every `FindTable` lookup is forwarded to the active
7 : // `backend::storage::Storage` instance through
8 : // `Storage::GetSchema`, and the engine-agnostic
9 : // `schema::TableSchema` is converted on the fly into a
10 : // `googlesql::SimpleTable` whose columns carry the matching
11 : // `googlesql::Type*` allocations from the supplied `TypeFactory`.
12 : //
13 : // The adapter intentionally does no preloading: BigQuery datasets
14 : // can hold millions of tables and the analyzer touches only the ones
15 : // referenced by the query, so the adapter materializes each
16 : // `SimpleTable` the first time it's named and then caches it for
17 : // the lifetime of the catalog instance. The cache is protected by an
18 : // `absl::Mutex` so the catalog is safe to share across analyzer
19 : // threads, but each instance is normally per-query: the engine
20 : // constructs one when a `Query.ExecuteQuery` RPC arrives, hands it
21 : // to GoogleSQL, and discards it when the query completes.
22 : //
23 : // Name resolution rules (mirroring BigQuery's REST identifier shape):
24 : //
25 : // * One-element paths resolve to `<default_dataset>.<table>` inside
26 : // `project_id_` when `default_dataset_id_` is non-empty (BigQuery
27 : // `defaultDataset` on `jobs.query`).
28 : // * Two-element paths are interpreted as `<dataset>.<table>` inside
29 : // `project_id_`, except `INFORMATION_SCHEMA.<view>` which is the
30 : // project-scoped metadata view shape.
31 : // * Three-element paths are interpreted as
32 : // `<project>.<dataset>.<table>` when the middle segment is not
33 : // `INFORMATION_SCHEMA`, otherwise as
34 : // `<dataset>.INFORMATION_SCHEMA.<view>`.
35 : // * Table ids ending in `*` resolve as BigQuery wildcard tables
36 : // (UNION ALL of every matching physical table in the dataset).
37 : // * Anything else (zero or four-plus elements) returns `NOT_FOUND`.
38 : //
39 : // Materialized tables are `backend::catalog::StorageTable` instances
40 : // (a `SimpleTable` subclass with a working
41 : // `CreateEvaluatorTableIterator` that streams rows out of the
42 : // underlying `Storage`). The catalog drives analyzer name
43 : // resolution; the DuckDB engine then reads the resolved AST and
44 : // executes through DuckDB directly.
45 : //
46 : // The catalog inherits from `googlesql::SimpleCatalog` so the
47 : // analyzer can look up GoogleSQL built-in functions and types
48 : // through the standard `SimpleCatalog::AddBuiltinFunctionsAndTypes`
49 : // path -- the constructor wires that up once per query so the
50 : // analyzer sees `COUNT`, `SUM`, `CONCAT`, and friends. We override
51 : // `FindTable` to short-circuit the SimpleCatalog default and hit
52 : // `Storage` directly for the BigQuery `<dataset>.<table>` /
53 : // `<project>.<dataset>.<table>` path shapes.
54 :
55 : #include <memory>
56 : #include <string>
57 : #include <vector>
58 :
59 : #include "absl/base/thread_annotations.h"
60 : #include "absl/status/status.h"
61 : #include "absl/status/statusor.h"
62 : #include "absl/strings/string_view.h"
63 : #include "absl/synchronization/mutex.h"
64 : #include "absl/types/span.h"
65 : #include "backend/catalog/storage_table.h"
66 : #include "backend/schema/schema.h"
67 : #include "backend/storage/storage.h"
68 : #include "googlesql/public/analyzer_output.h"
69 : #include "googlesql/public/catalog.h"
70 : #include "googlesql/public/language_options.h"
71 : #include "googlesql/public/simple_catalog.h"
72 : #include "googlesql/public/type.h"
73 : #include "googlesql/public/types/type_factory.h"
74 :
75 : namespace bigquery_emulator {
76 : namespace backend {
77 : namespace catalog {
78 :
79 : // Language options for catalog builtin registration. Must stay aligned
80 : // with `frontend/handlers/query_internal.cc::MakeAnalyzerOptions` and
81 : // `local_coordinator_analyze.cc::MakeAnalyzerOptionsBase` so internal
82 : // templated builtins (e.g. `$with_side_effects` for LIKE ANY lists)
83 : // resolve during analysis.
84 84 : inline ::googlesql::LanguageOptions MakeCatalogLanguageOptions() {
85 84 : ::googlesql::LanguageOptions language;
86 84 : language.EnableMaximumLanguageFeaturesForDevelopment();
87 84 : language.EnableLanguageFeature(::googlesql::FEATURE_WITH_EXPRESSION);
88 84 : language.EnableLanguageFeature(::googlesql::FEATURE_MATCH_RECOGNIZE);
89 84 : language.EnableLanguageFeature(
90 84 : ::googlesql::FEATURE_STRATIFIED_RESERVOIR_TABLESAMPLE);
91 84 : language.EnableLanguageFeature(::googlesql::FEATURE_KLL_WEIGHTS);
92 84 : language.EnableLanguageFeature(::googlesql::FEATURE_CREATE_TABLE_CLONE);
93 84 : language.EnableLanguageFeature(::googlesql::FEATURE_CREATE_SNAPSHOT_TABLE);
94 84 : language.EnableLanguageFeature(::googlesql::FEATURE_CLONE_DATA);
95 84 : language.EnableLanguageFeature(::googlesql::FEATURE_REMOTE_MODEL);
96 84 : language.EnableLanguageFeature(::googlesql::FEATURE_ENABLE_MEASURES);
97 84 : language.set_product_mode(::googlesql::PRODUCT_EXTERNAL);
98 84 : language.set_name_resolution_mode(::googlesql::NAME_RESOLUTION_DEFAULT);
99 84 : language.SetSupportsAllStatementKinds();
100 84 : return language;
101 84 : }
102 :
103 : class GoogleSqlCatalog : public ::googlesql::SimpleCatalog {
104 : public:
105 : // `storage` and `type_factory` must outlive the catalog. The
106 : // catalog does not take ownership; the typical lifetime is
107 : // `engine constructs storage + type_factory at startup` and the
108 : // catalog is constructed per query and discarded after the query
109 : // completes.
110 : //
111 : // `project_id` is the implicit project for two-element table
112 : // paths. It must be non-empty; the BigQuery REST surface always
113 : // supplies a project on `jobs.query` / `jobs.insert` requests.
114 : // `language` controls which GoogleSQL feature set is registered on
115 : // the catalog (via `SimpleCatalog::AddBuiltinFunctionsAndTypes`).
116 : // Pass the same `LanguageOptions` the analyzer is configured with
117 : // so the analyzer and the catalog agree on what's resolvable.
118 : GoogleSqlCatalog(absl::string_view project_id,
119 : storage::Storage* storage,
120 : ::googlesql::TypeFactory* type_factory,
121 : const ::googlesql::LanguageOptions& language,
122 : absl::string_view default_dataset_id = "");
123 :
124 : ~GoogleSqlCatalog() override = default;
125 :
126 : GoogleSqlCatalog(const GoogleSqlCatalog&) = delete;
127 : GoogleSqlCatalog& operator=(const GoogleSqlCatalog&) = delete;
128 :
129 : // SimpleCatalog::FullName() returns the catalog name we passed to
130 : // its constructor; mirror the previous "project as catalog name"
131 : // contract by always returning the project_id.
132 0 : std::string FullName() const override {
133 0 : return project_id_;
134 0 : }
135 :
136 : // The `TypeFactory` the caller pinned for this catalog's lifetime.
137 : // Analyzer passes must allocate resolved types through this factory
138 : // so pointers in the AST stay valid until the query completes.
139 0 : ::googlesql::TypeFactory* type_factory() const {
140 0 : return type_factory_;
141 0 : }
142 :
143 0 : storage::Storage* storage() const {
144 0 : return storage_;
145 0 : }
146 :
147 : // Path resolution rules are documented in the file header. A miss
148 : // returns `absl::StatusCode::kNotFound`; any other status indicates
149 : // a storage-level failure mid-lookup.
150 : absl::Status FindTable(const absl::Span<const std::string>& path,
151 : const ::googlesql::Table** table,
152 : const FindOptions& options = FindOptions()) override
153 : ABSL_LOCKS_EXCLUDED(mu_);
154 :
155 : absl::Status FindFunction(
156 : const absl::Span<const std::string>& path,
157 : const ::googlesql::Function** function,
158 : const FindOptions& options = FindOptions()) override;
159 :
160 : absl::Status FindModel(const absl::Span<const std::string>& path,
161 : const ::googlesql::Model** model,
162 : const FindOptions& options = FindOptions()) override
163 : ABSL_LOCKS_EXCLUDED(mu_);
164 :
165 : absl::Status FindTableValuedFunction(
166 : const absl::Span<const std::string>& path,
167 : const ::googlesql::TableValuedFunction** function,
168 : const FindOptions& options = FindOptions()) override;
169 :
170 : absl::Status FindProcedure(
171 : const absl::Span<const std::string>& path,
172 : const ::googlesql::Procedure** procedure,
173 : const FindOptions& options = FindOptions()) override;
174 :
175 : // Convert a `schema::ColumnSchema` into a freshly-allocated
176 : // `googlesql::Type*`. Public so other catalog adapters can reuse
177 : // the same translation when they build their typed columns.
178 : //
179 : // The mapping is total for the BigQuery scalar / structural types
180 : // defined on `schema::ColumnType`. Unknown / unsupported types
181 : // (`kUnknown`, `kGeography`) yield `INVALID_ARGUMENT` so the
182 : // analyzer fails fast instead of silently substituting a string.
183 : // ARRAY-mode columns wrap the inner scalar/struct type in an
184 : // `ARRAY<T>`; STRUCT columns recurse on `fields`.
185 : static absl::StatusOr<const ::googlesql::Type*> ToGoogleSqlType(
186 : const schema::ColumnSchema& column,
187 : ::googlesql::TypeFactory* type_factory);
188 :
189 : private:
190 : // Looks up `dataset_id.table_id` in `project` via `storage_->GetSchema`
191 : // and materializes a `SimpleTable` for it, populating `table_cache_`
192 : // so repeated lookups during the same analysis pass return the
193 : // same pointer. The returned pointer is owned by the cache and
194 : // stays valid for the lifetime of the catalog.
195 : struct MaterializedTableBuild {
196 : StorageTable* table = nullptr;
197 : schema::TableSchema logical_schema;
198 : };
199 :
200 : absl::StatusOr<MaterializedTableBuild> MaterializeTablePhysical(
201 : absl::string_view project_id,
202 : absl::string_view dataset_id,
203 : absl::string_view table_id) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_);
204 :
205 : absl::StatusOr<const ::googlesql::Table*> MaterializeInfoSchemaView(
206 : absl::string_view project_id,
207 : absl::string_view dataset_id,
208 : absl::string_view view_name) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_);
209 :
210 : absl::StatusOr<const ::googlesql::Table*> MaterializeWildcardTable(
211 : absl::string_view project_id,
212 : absl::string_view dataset_id,
213 : absl::string_view wildcard_table_id) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_);
214 :
215 : // Cache key shape mirrors `storage::TableId` so a query that
216 : // qualifies the same table two different ways (with vs. without
217 : // the project prefix) still collapses onto one cache entry.
218 : static std::string CacheKey(absl::string_view project_id,
219 : absl::string_view dataset_id,
220 : absl::string_view table_id);
221 :
222 : const std::string project_id_;
223 : const std::string default_dataset_id_;
224 : storage::Storage* const storage_ = nullptr;
225 : ::googlesql::TypeFactory* const type_factory_ = nullptr;
226 :
227 : mutable absl::Mutex mu_;
228 : // Owns every catalog table we hand out to the analyzer / evaluator.
229 : // Keys are `CacheKey(project, dataset, table)`; values are non-null.
230 : std::vector<std::unique_ptr<::googlesql::SimpleTable>> tables_
231 : ABSL_GUARDED_BY(mu_){};
232 : // Parallel-by-index lookup from cache key to `tables_` entry.
233 : // `flat_hash_map` would be cleaner but the catalog stays small per
234 : // query (BigQuery queries rarely reference more than a handful of
235 : // tables) and a linear scan keeps the dependency surface narrow.
236 : std::vector<std::string> keys_ ABSL_GUARDED_BY(mu_){};
237 : // Non-owning cache of registry-backed views (owned by view_registry).
238 : std::vector<std::string> registered_view_keys_ ABSL_GUARDED_BY(mu_){};
239 : std::vector<const ::googlesql::Table*> registered_views_
240 : ABSL_GUARDED_BY(mu_){};
241 : // Holds analyzer outputs for measure column expressions so resolved
242 : // pointers on `SimpleTable` columns stay valid for the catalog lifetime.
243 : std::vector<std::unique_ptr<const ::googlesql::AnalyzerOutput>>
244 : measure_outputs_ ABSL_GUARDED_BY(mu_){};
245 : std::vector<std::unique_ptr<const ::googlesql::ResolvedExpr>>
246 : measure_resolved_exprs_ ABSL_GUARDED_BY(mu_){};
247 : };
248 :
249 : } // namespace catalog
250 : } // namespace backend
251 : } // namespace bigquery_emulator
252 :
253 : #endif // BIGQUERY_EMULATOR_BACKEND_CATALOG_GOOGLESQL_CATALOG_H_
|