Line data Source code
1 : #ifndef BIGQUERY_EMULATOR_BACKEND_STORAGE_DUCKDB_DUCKDB_STORAGE_H_
2 : #define BIGQUERY_EMULATOR_BACKEND_STORAGE_DUCKDB_DUCKDB_STORAGE_H_
3 :
4 : // DuckDBStorage is the persistent, file-backed `Storage` implementation.
5 : //
6 : // Layout under `data_dir`:
7 : //
8 : // <data_dir>/
9 : // catalog.duckdb # DuckDB catalog file
10 : // <project_id>/ # one dir per project
11 : // <dataset_id>/ # one dir per dataset
12 : // _dataset.meta.json # dataset-level metadata
13 : // <table_id>.parquet # data file
14 : // <table_id>.meta.json # per-table sidecar
15 : //
16 : // The DuckDB catalog file tracks dataset existence (as DuckDB schemas)
17 : // and table existence (as DuckDB views over the matching parquet file).
18 : // BigQuery-specific metadata that does not fit cleanly in DuckDB
19 : // (description, labels, friendlyName, etag, and the BigQuery-typed
20 : // schema) lives in the JSON sidecars so a developer can inspect, edit,
21 : // or hand-author a dataset/table without going through the emulator.
22 : //
23 : // This header is the *core* skeleton: it owns the connection, the
24 : // directory layout, the metadata sidecar, and dataset/table CRUD.
25 : // The actual Parquet I/O for `AppendRows` / `ScanRows` lands in the
26 : // follow-up plan `duckdb-storage-ddl_p1e2f3a4`; both methods return
27 : // UNIMPLEMENTED until then.
28 : //
29 : // Concurrency: every public method acquires a single absl::Mutex.
30 : // DuckDB itself is thread-safe per-connection but we serialize at
31 : // the C++ level so dataset / table directory mutations stay in
32 : // lockstep with catalog rows.
33 :
#include <cstdint>
#include <filesystem>
#include <memory>
#include <optional>
#include <string>
#include <vector>

#include "absl/base/thread_annotations.h"
#include "absl/status/status.h"
#include "absl/status/statusor.h"
#include "absl/strings/string_view.h"
#include "absl/synchronization/mutex.h"
#include "absl/types/span.h"
#include "backend/schema/schema.h"
#include "backend/storage/storage.h"
47 :
48 : namespace bigquery_emulator {
49 : namespace backend {
50 : namespace storage {
51 : namespace duckdb {
52 :
53 : class DuckDBStorage : public Storage {
54 : public:
55 : // Constructs a DuckDBStorage rooted at `data_dir`. The directory is
56 : // created (recursively) if it does not exist. Opens a DuckDB
57 : // connection backed by `<data_dir>/catalog.duckdb` so dataset /
58 : // table existence survives process restarts.
59 : //
60 : // Returns INVALID_ARGUMENT when `data_dir` is empty, FAILED_PRECONDITION
61 : // when the directory can not be created (e.g. permission denied), or
62 : // INTERNAL when DuckDB itself refuses to open the catalog file. On
63 : // success the caller owns the returned unique_ptr; the connection
64 : // closes on destruction.
65 : static absl::StatusOr<std::unique_ptr<DuckDBStorage>> Open(
66 : absl::string_view data_dir);
67 :
68 : ~DuckDBStorage() override;
69 :
70 : DuckDBStorage(const DuckDBStorage&) = delete;
71 : DuckDBStorage& operator=(const DuckDBStorage&) = delete;
72 :
73 : // Path the storage was opened with. Stable for the lifetime of the
74 : // instance; exposed mainly for tests / logs.
75 0 : absl::string_view data_dir() const override {
76 0 : return data_dir_;
77 0 : }
78 :
79 : // ------------------------------------------------------------------
80 : // Storage interface
81 : // ------------------------------------------------------------------
82 : absl::Status CreateDataset(const DatasetId& id,
83 : absl::string_view location) override;
84 : absl::Status DropDataset(const DatasetId& id,
85 : bool delete_contents,
86 : absl::string_view rest_metadata_json = {}) override;
87 : absl::Status RestoreDataset(const DatasetId& id,
88 : std::int64_t deleted_ms = 0) override;
89 : absl::StatusOr<std::vector<DatasetId>> ListDatasets(
90 : absl::string_view project_id) const override;
91 :
92 : absl::Status CreateTable(const TableId& id,
93 : const schema::TableSchema& schema) override;
94 : absl::Status DropTable(const TableId& id) override;
95 : absl::Status RestoreTable(const TableId& id,
96 : std::int64_t deleted_ms = 0) override;
97 : absl::StatusOr<std::vector<TableId>> ListTables(
98 : const DatasetId& dataset_id) const override;
99 :
100 : absl::StatusOr<schema::TableSchema> GetSchema(
101 : const TableId& id) const override;
102 :
103 : // The core skeleton returns UNIMPLEMENTED for these two. The DDL
104 : // plan (`duckdb-storage-ddl_p1e2f3a4`) lowers them onto Parquet
105 : // I/O via DuckDB's `read_parquet` + INSERT statements.
106 : absl::Status AppendRows(const TableId& id,
107 : absl::Span<const Row> rows) override;
108 : absl::Status OverwriteRows(const TableId& id,
109 : absl::Span<const Row> rows) override;
110 : absl::StatusOr<std::unique_ptr<RowIterator>> ScanRows(
111 : const TableId& id) const override;
112 : absl::StatusOr<std::unique_ptr<RowIterator>> CreateReadStream(
113 : const TableId& id, const ReadFilter& filter) const override;
114 : absl::StatusOr<std::int64_t> CountRows(const TableId& id) const override;
115 :
116 : std::optional<std::string> ParquetSnapshotPath(
117 : const TableId& id) const override;
118 : absl::StatusOr<std::optional<std::string>> ParquetSnapshotPathAt(
119 : const TableId& id, std::int64_t as_of_ms) const override;
120 :
121 : absl::Status UpsertRoutine(const RoutineRecord& record) override;
122 : absl::Status DeleteRoutine(const RoutineId& id) override;
123 : absl::StatusOr<RoutineRecord> GetRoutine(const RoutineId& id) const override;
124 : absl::StatusOr<std::vector<RoutineRecord>> ListRoutines(
125 : const DatasetId& dataset_id) const override;
126 : absl::StatusOr<std::vector<RoutineRecord>> ListAllRoutines() const override;
127 :
128 : absl::Status UpsertView(const ViewRecord& record) override;
129 : absl::Status DeleteView(const ViewId& id) override;
130 : absl::StatusOr<std::vector<ViewRecord>> ListAllViews() const override;
131 : absl::StatusOr<TableResourceInfo> GetTableResourceInfo(
132 : const TableId& id) const override;
133 :
134 : absl::StatusOr<TableGovernance> GetTableGovernance(
135 : const TableId& id) const override;
136 : absl::Status UpsertRowAccessPolicy(
137 : const TableId& id, const RowAccessPolicyRecord& policy) override;
138 : absl::Status DeleteRowAccessPolicy(const TableId& id,
139 : absl::string_view policy_id) override;
140 : absl::Status SetColumnGovernance(
141 : const TableId& id,
142 : absl::string_view column_name,
143 : const ColumnGovernanceRecord& column) override;
144 :
145 : // Ensures catalog metadata tables (e.g. `__bqemu_routines`) exist.
146 : // Called from `Open` and idempotently before routine CRUD.
147 : absl::Status InitCatalogTables();
148 :
149 : absl::StatusOr<std::string> GetDatasetRestMetadataJson(
150 : const DatasetId& id) const;
151 :
152 : // Dataset tombstone helpers (caller must hold mu_).
153 : absl::Status SnapshotDatasetRegistryForTombstoneLocked(
154 : const DatasetId& id, const std::filesystem::path& tombstone_dir);
155 : absl::Status RestoreDatasetRegistryFromTombstoneLocked(
156 : const DatasetId& id, const std::filesystem::path& tombstone_dir);
157 : absl::Status PurgeDatasetRegistryRowsLocked(const DatasetId& id);
158 : absl::StatusOr<std::string> GetDatasetRestMetadataJsonLocked(
159 : const DatasetId& id) const;
160 :
161 : // Pimpl: keeps the DuckDB C handles out of this header so the
162 : // engine-agnostic Storage signatures stay enforceable from the
163 : // include graph alone (callers cannot accidentally reach into
164 : // `duckdb_database` / `duckdb_connection`). Public so the
165 : // translation unit's helper functions can take an `Impl*` directly
166 : // — the struct itself is only ever defined inside duckdb_storage.cc.
167 : struct Impl;
168 :
169 : private:
170 : DuckDBStorage(std::string data_dir, std::unique_ptr<Impl> impl);
171 :
172 : // Filesystem layout helpers. All take ids by string_view and emit
173 : // absolute paths under `data_dir_`.
174 : std::string DatasetDir(absl::string_view project_id,
175 : absl::string_view dataset_id) const;
176 : std::string DatasetDir(const DatasetId& id) const;
177 : std::string DatasetMetaPath(const DatasetId& id) const;
178 : std::string TableMetaPath(const TableId& id) const;
179 : std::string TableParquetPath(const TableId& id) const;
180 :
181 : std::string TableGovernancePath(const TableId& id) const;
182 :
183 : absl::Status PutTableGovernance(const TableId& id,
184 : const TableGovernance& gov);
185 :
186 : // Stable DuckDB schema name for a (project, dataset) pair. We can't
187 : // just use the dataset_id because two projects may share a dataset
188 : // id; collapse them into one safe identifier so DuckDB stays happy.
189 : static std::string DuckDBSchemaName(absl::string_view project_id,
190 : absl::string_view dataset_id);
191 : static std::string DuckDBSchemaName(const DatasetId& id);
192 :
193 : std::string data_dir_;
194 : mutable absl::Mutex mu_;
195 : // The pointer is set once at construction and never reassigned; the
196 : // *contents* of the connection are guarded by `mu_` because DuckDB
197 : // is thread-safe per-connection but the dataset/table directory
198 : // mutations need to stay coherent with the DuckDB catalog rows we
199 : // emit alongside them.
200 : std::unique_ptr<Impl> impl_{};
201 : };
202 :
203 : } // namespace duckdb
204 : } // namespace storage
205 : } // namespace backend
206 : } // namespace bigquery_emulator
207 :
208 : #endif // BIGQUERY_EMULATOR_BACKEND_STORAGE_DUCKDB_DUCKDB_STORAGE_H_
|