LCOV - code coverage report
Current view: top level - backend/catalog - googlesql_catalog.h (source / functions)
Test: _coverage_report.dat
Test Date: 2026-07-02 21:01:18
Coverage: Lines 66.7 % (18 of 27 hit); Functions 25.0 % (1 of 4 hit)

            Line data    Source code
       1              : #ifndef BIGQUERY_EMULATOR_BACKEND_CATALOG_GOOGLESQL_CATALOG_H_
       2              : #define BIGQUERY_EMULATOR_BACKEND_CATALOG_GOOGLESQL_CATALOG_H_
       3              : 
       4              : // GoogleSqlCatalog is the GoogleSQL-facing catalog that the analyzer
       5              : // consults during name resolution. It is a thin
       6              : // adapter: every `FindTable` lookup is forwarded to the active
       7              : // `backend::storage::Storage` instance through
       8              : // `Storage::GetSchema`, and the engine-agnostic
       9              : // `schema::TableSchema` is converted on the fly into a
      10              : // `googlesql::SimpleTable` whose columns carry the matching
      11              : // `googlesql::Type*` allocations from the supplied `TypeFactory`.
      12              : //
      13              : // The adapter intentionally does no preloading: BigQuery datasets
      14              : // can hold millions of tables and the analyzer touches only the ones
      15              : // referenced by the query, so the adapter materializes each
      16              : // `SimpleTable` the first time it's named and then caches it for
      17              : // the lifetime of the catalog instance. The cache is protected by an
      18              : // `absl::Mutex` so the catalog is safe to share across analyzer
      19              : // threads, but each instance is normally per-query: the engine
      20              : // constructs one when a `Query.ExecuteQuery` RPC arrives, hands it
      21              : // to GoogleSQL, and discards it when the query completes.
      22              : //
      23              : // Name resolution rules (mirroring BigQuery's REST identifier shape):
      24              : //
      25              : //   * One-element paths resolve to `<default_dataset>.<table>` inside
      26              : //     `project_id_` when `default_dataset_id_` is non-empty (BigQuery
      27              : //     `defaultDataset` on `jobs.query`).
      28              : //   * Two-element paths are interpreted as `<dataset>.<table>` inside
      29              : //     `project_id_`, except `INFORMATION_SCHEMA.<view>` which is the
      30              : //     project-scoped metadata view shape.
      31              : //   * Three-element paths are interpreted as
      32              : //     `<project>.<dataset>.<table>` when the middle segment is not
      33              : //     `INFORMATION_SCHEMA`, otherwise as
      34              : //     `<dataset>.INFORMATION_SCHEMA.<view>`.
      35              : //   * Table ids ending in `*` resolve as BigQuery wildcard tables
      36              : //     (UNION ALL of every matching physical table in the dataset).
      37              : //   * Anything else (zero or four-plus elements) returns `NOT_FOUND`.
      38              : //
      39              : // Materialized tables are `backend::catalog::StorageTable` instances
      40              : // (a `SimpleTable` subclass with a working
      41              : // `CreateEvaluatorTableIterator` that streams rows out of the
      42              : // underlying `Storage`). The catalog drives analyzer name
      43              : // resolution; the DuckDB engine then reads the resolved AST and
      44              : // executes through DuckDB directly.
      45              : //
      46              : // The catalog inherits from `googlesql::SimpleCatalog` so the
      47              : // analyzer can look up GoogleSQL built-in functions and types
      48              : // through the standard `SimpleCatalog::AddBuiltinFunctionsAndTypes`
      49              : // path -- the constructor wires that up once per query so the
      50              : // analyzer sees `COUNT`, `SUM`, `CONCAT`, and friends. We override
      51              : // `FindTable` to short-circuit the SimpleCatalog default and hit
      52              : // `Storage` directly for the BigQuery `<dataset>.<table>` /
      53              : // `<project>.<dataset>.<table>` path shapes.
      54              : 
      55              : #include <memory>
      56              : #include <string>
      57              : #include <vector>
      58              : 
      59              : #include "absl/base/thread_annotations.h"
      60              : #include "absl/status/status.h"
      61              : #include "absl/status/statusor.h"
      62              : #include "absl/strings/string_view.h"
      63              : #include "absl/synchronization/mutex.h"
      64              : #include "absl/types/span.h"
      65              : #include "backend/catalog/storage_table.h"
      66              : #include "backend/schema/schema.h"
      67              : #include "backend/storage/storage.h"
      68              : #include "googlesql/public/analyzer_output.h"
      69              : #include "googlesql/public/catalog.h"
      70              : #include "googlesql/public/language_options.h"
      71              : #include "googlesql/public/simple_catalog.h"
      72              : #include "googlesql/public/type.h"
      73              : #include "googlesql/public/types/type_factory.h"
      74              : 
      75              : namespace bigquery_emulator {
      76              : namespace backend {
      77              : namespace catalog {
      78              : 
      79              : // Builds the `LanguageOptions` for catalog builtin registration. Keep
      80              : // in sync with `frontend/handlers/query_internal.cc::MakeAnalyzerOptions`
      81              : // and `local_coordinator_analyze.cc::MakeAnalyzerOptionsBase` so internal
      82              : // templated builtins (e.g. `$with_side_effects` for LIKE ANY lists)
      83              : // resolve during analysis.
      84           84 : inline ::googlesql::LanguageOptions MakeCatalogLanguageOptions() {
      85           84 :   ::googlesql::LanguageOptions language;
      86           84 :   language.EnableMaximumLanguageFeaturesForDevelopment();
      87           84 :   language.EnableLanguageFeature(::googlesql::FEATURE_WITH_EXPRESSION);
      88           84 :   language.EnableLanguageFeature(::googlesql::FEATURE_MATCH_RECOGNIZE);
      89           84 :   language.EnableLanguageFeature(
      90           84 :       ::googlesql::FEATURE_STRATIFIED_RESERVOIR_TABLESAMPLE);
      91           84 :   language.EnableLanguageFeature(::googlesql::FEATURE_KLL_WEIGHTS);
      92           84 :   language.EnableLanguageFeature(::googlesql::FEATURE_CREATE_TABLE_CLONE);
      93           84 :   language.EnableLanguageFeature(::googlesql::FEATURE_CREATE_SNAPSHOT_TABLE);
      94           84 :   language.EnableLanguageFeature(::googlesql::FEATURE_CLONE_DATA);
      95           84 :   language.EnableLanguageFeature(::googlesql::FEATURE_REMOTE_MODEL);
      96           84 :   language.EnableLanguageFeature(::googlesql::FEATURE_ENABLE_MEASURES);
      97           84 :   language.set_product_mode(::googlesql::PRODUCT_EXTERNAL);
      98           84 :   language.set_name_resolution_mode(::googlesql::NAME_RESOLUTION_DEFAULT);
      99           84 :   language.SetSupportsAllStatementKinds();
     100           84 :   return language;
     101           84 : }
     102              : 
     103              : class GoogleSqlCatalog : public ::googlesql::SimpleCatalog {
     104              :  public:
     105              :   // `storage` and `type_factory` are borrowed, not owned, and must
     106              :   // outlive the catalog. The typical lifetime is `engine constructs
     107              :   // storage + type_factory at startup`; the catalog itself is
     108              :   // constructed per query and discarded after the query completes.
     109              :   //
     110              :   // `project_id` is the implicit project for one- and two-element
     111              :   // table paths. It must be non-empty; the BigQuery REST surface always
     112              :   // supplies a project on `jobs.query` / `jobs.insert` requests.
     113              :   // `default_dataset_id`, when non-empty, is the dataset used for
     114              :   // one-element paths (BigQuery `defaultDataset`); it defaults to "".
     115              :   // `language` controls which GoogleSQL feature set is registered via
     116              :   // `SimpleCatalog::AddBuiltinFunctionsAndTypes`; pass the same
     117              :   // `LanguageOptions` the analyzer uses so both agree on what resolves.
     118              :   GoogleSqlCatalog(absl::string_view project_id,
     119              :                    storage::Storage* storage,
     120              :                    ::googlesql::TypeFactory* type_factory,
     121              :                    const ::googlesql::LanguageOptions& language,
     122              :                    absl::string_view default_dataset_id = "");
     123              : 
     124              :   ~GoogleSqlCatalog() override = default;
     125              : 
     126              :   GoogleSqlCatalog(const GoogleSqlCatalog&) = delete;
     127              :   GoogleSqlCatalog& operator=(const GoogleSqlCatalog&) = delete;
     128              : 
     129              :   // SimpleCatalog::FullName() returns the catalog name we passed to
     130              :   // its constructor; mirror the previous "project as catalog name"
     131              :   // contract by always returning the project_id.
     132            0 :   std::string FullName() const override {
     133            0 :     return project_id_;
     134            0 :   }
     135              : 
     136              :   // The `TypeFactory` the caller pinned for this catalog's lifetime.
     137              :   // Analyzer passes must allocate resolved types through this factory
     138              :   // so pointers in the AST stay valid until the query completes.
     139            0 :   ::googlesql::TypeFactory* type_factory() const {
     140            0 :     return type_factory_;
     141            0 :   }
     142              :   // The borrowed `Storage` passed to the constructor (not owned).
     143            0 :   storage::Storage* storage() const {
     144            0 :     return storage_;
     145            0 :   }
     146              : 
     147              :   // Path resolution rules are documented in the file header. A miss
     148              :   // returns `absl::StatusCode::kNotFound`; any other status indicates
     149              :   // a storage-level failure mid-lookup.
     150              :   absl::Status FindTable(const absl::Span<const std::string>& path,
     151              :                          const ::googlesql::Table** table,
     152              :                          const FindOptions& options = FindOptions()) override
     153              :       ABSL_LOCKS_EXCLUDED(mu_);
     154              : 
     155              :   absl::Status FindFunction(
     156              :       const absl::Span<const std::string>& path,
     157              :       const ::googlesql::Function** function,
     158              :       const FindOptions& options = FindOptions()) override;
     159              : 
     160              :   absl::Status FindModel(const absl::Span<const std::string>& path,
     161              :                          const ::googlesql::Model** model,
     162              :                          const FindOptions& options = FindOptions()) override
     163              :       ABSL_LOCKS_EXCLUDED(mu_);
     164              : 
     165              :   absl::Status FindTableValuedFunction(
     166              :       const absl::Span<const std::string>& path,
     167              :       const ::googlesql::TableValuedFunction** function,
     168              :       const FindOptions& options = FindOptions()) override;
     169              : 
     170              :   absl::Status FindProcedure(
     171              :       const absl::Span<const std::string>& path,
     172              :       const ::googlesql::Procedure** procedure,
     173              :       const FindOptions& options = FindOptions()) override;
     174              : 
     175              :   // Convert a `schema::ColumnSchema` into a `googlesql::Type*`
     176              :   // allocated from `type_factory`. Public so other catalog adapters
     177              :   // can reuse the same translation when they build their typed columns.
     178              :   //
     179              :   // The mapping is total for the BigQuery scalar / structural types
     180              :   // defined on `schema::ColumnType`. Unknown / unsupported types
     181              :   // (`kUnknown`, `kGeography`) yield `INVALID_ARGUMENT` so the
     182              :   // analyzer fails fast instead of silently substituting a string.
     183              :   // ARRAY-mode columns wrap the inner scalar/struct type in an
     184              :   // `ARRAY<T>`; STRUCT columns recurse on `fields`.
     185              :   static absl::StatusOr<const ::googlesql::Type*> ToGoogleSqlType(
     186              :       const schema::ColumnSchema& column,
     187              :       ::googlesql::TypeFactory* type_factory);
     188              : 
     189              :  private:
     190              :   // Result of `MaterializeTablePhysical`, which looks up
     191              :   // `dataset_id.table_id` in `project_id` via `storage_->GetSchema`,
     192              :   // materializes a table for it, and records it in `tables_` / `keys_`
     193              :   // so repeated lookups in one analysis pass return the same pointer.
     194              :   // `table` is owned by that cache and valid for the catalog lifetime.
     195              :   struct MaterializedTableBuild {
     196              :     StorageTable* table = nullptr;
     197              :     schema::TableSchema logical_schema;
     198              :   };
     199              :   // Described above `MaterializedTableBuild`; caller must hold `mu_`.
     200              :   absl::StatusOr<MaterializedTableBuild> MaterializeTablePhysical(
     201              :       absl::string_view project_id,
     202              :       absl::string_view dataset_id,
     203              :       absl::string_view table_id) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_);
     204              :   // Materializes an `INFORMATION_SCHEMA` view; caller must hold `mu_`.
     205              :   absl::StatusOr<const ::googlesql::Table*> MaterializeInfoSchemaView(
     206              :       absl::string_view project_id,
     207              :       absl::string_view dataset_id,
     208              :       absl::string_view view_name) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_);
     209              :   // Materializes a wildcard table (id ending in `*`); needs `mu_` held.
     210              :   absl::StatusOr<const ::googlesql::Table*> MaterializeWildcardTable(
     211              :       absl::string_view project_id,
     212              :       absl::string_view dataset_id,
     213              :       absl::string_view wildcard_table_id) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_);
     214              : 
     215              :   // Cache key shape mirrors `storage::TableId` so a query that
     216              :   // qualifies the same table two different ways (with vs. without
     217              :   // the project prefix) still collapses onto one cache entry.
     218              :   static std::string CacheKey(absl::string_view project_id,
     219              :                               absl::string_view dataset_id,
     220              :                               absl::string_view table_id);
     221              : 
     222              :   const std::string project_id_;
     223              :   const std::string default_dataset_id_;
     224              :   storage::Storage* const storage_ = nullptr;
     225              :   ::googlesql::TypeFactory* const type_factory_ = nullptr;
     226              : 
     227              :   mutable absl::Mutex mu_;
     228              :   // Owns every catalog table we hand out to the analyzer / evaluator.
     229              :   // Entries are non-null; entry `i` is keyed by `keys_[i]`.
     230              :   std::vector<std::unique_ptr<::googlesql::SimpleTable>> tables_
     231              :       ABSL_GUARDED_BY(mu_){};
     232              :   // Cache keys, parallel by index to `tables_`: `keys_[i]` is the
     233              :   // `CacheKey(project, dataset, table)` of `tables_[i]`. A
     234              :   // `flat_hash_map` would be cleaner, but queries rarely reference more
     235              :   // than a handful of tables and a linear scan keeps dependencies narrow.
     236              :   std::vector<std::string> keys_ ABSL_GUARDED_BY(mu_){};
     237              :   // Non-owning cache of registry-backed views (owned by view_registry).
     238              :   std::vector<std::string> registered_view_keys_ ABSL_GUARDED_BY(mu_){};
     239              :   std::vector<const ::googlesql::Table*> registered_views_
     240              :       ABSL_GUARDED_BY(mu_){};
     241              :   // Holds analyzer outputs for measure column expressions so resolved
     242              :   // pointers on `SimpleTable` columns stay valid for the catalog lifetime.
     243              :   std::vector<std::unique_ptr<const ::googlesql::AnalyzerOutput>>
     244              :       measure_outputs_ ABSL_GUARDED_BY(mu_){};
     245              :   std::vector<std::unique_ptr<const ::googlesql::ResolvedExpr>>
     246              :       measure_resolved_exprs_ ABSL_GUARDED_BY(mu_){};
     247              : };
     248              : 
     249              : }  // namespace catalog
     250              : }  // namespace backend
     251              : }  // namespace bigquery_emulator
     252              : 
     253              : #endif  // BIGQUERY_EMULATOR_BACKEND_CATALOG_GOOGLESQL_CATALOG_H_
        

Generated by: LCOV version 2.0-1