ProvSQL C/C++ API
Adding support for provenance and uncertainty management to PostgreSQL databases
Loading...
Searching...
No Matches
provenance_evaluate_compiled.cpp
Go to the documentation of this file.
1/**
2 * @file provenance_evaluate_compiled.cpp
3 * @brief SQL function @c provsql.provenance_evaluate_compiled() – C++ semiring evaluation.
4 *
5 * Implements the compiled (C++ generic) variant of provenance circuit
6 * evaluation. Unlike @c provenance_evaluate() (which calls user-supplied
7 * PostgreSQL functions for each semiring operation), this function
8 * evaluates the circuit using one of the built-in C++ semiring
9 * implementations from the @c semiring/ directory.
10 *
11 * Supported semirings (selected by the @p semiring argument):
12 * - @c "boolean" → @c semiring::Boolean
13 * - @c "counting" → @c semiring::Counting
14 * - @c "formula" → @c semiring::Formula (symbolic representation as a formula)
15 * - @c "how" → @c semiring::How (canonical polynomial provenance ℕ[X])
16 * - @c "why" → @c semiring::Why (witness sets)
17 * - @c "which" → @c semiring::Which (lineage)
18 * - @c "boolexpr" → @c semiring::BoolExpr (Boolean circuit for probability)
19 * - @c "tropical" → @c semiring::Tropical (min-plus, shortest-cost)
20 * - @c "viterbi" → @c semiring::Viterbi (max-times, most-likely derivation)
21 * - @c "lukasiewicz" → @c semiring::Lukasiewicz (Łukasiewicz fuzzy t-norm)
22 * - @c "interval_union" → @c semiring::IntervalUnion (multirange union, PG14+)
23 * for any of <tt>tstzmultirange</tt>, <tt>nummultirange</tt>,
24 * <tt>int4multirange</tt>; selected by the result type of the call
25 * - @c "minmax" / @c "maxmin" → @c semiring::MinMax over any user-defined
26 * PostgreSQL enum type, selected by @c get_typtype() == @c TYPTYPE_ENUM
27 *
28 * Each compiled semiring exposes @c parse_leaf() and (for text-valued
29 * carriers) @c to_text() member functions, so this file is purely a
30 * dispatcher: it picks the right semiring instance from the
31 * (result-type, semiring-name) pair, then calls the @c pec() template
32 * which runs the same four-step pipeline for every semiring (build leaf
33 * mapping, evaluate HAVING sub-circuits, evaluate the main circuit,
34 * encode the result as a Datum).
35 */
36extern "C" {
37#include "postgres.h"
38#include "fmgr.h"
39#include "catalog/pg_type.h"
40#include "utils/uuid.h"
41#include "utils/builtins.h"
42#include "utils/lsyscache.h"
43#include "provsql_utils.h"
44
45PG_FUNCTION_INFO_V1(provenance_evaluate_compiled);
46}
47
48#include <string>
49#include <vector>
50#include <unordered_map>
51#include <algorithm>
52
53#include "Expectation.h"
54#include "having_semantics.hpp"
56#include "semiring/Boolean.h"
57#include "semiring/Counting.h"
58#include "semiring/Formula.h"
59#include "semiring/How.h"
60#include "semiring/Why.h"
61#include "semiring/Which.h"
62#include "semiring/BoolExpr.h"
63#include "semiring/Tropical.h"
64#include "semiring/Viterbi.h"
67#include "semiring/MinMax.h"
68
69const char *drop_temp_table = "DROP TABLE IF EXISTS tmp_uuids;";
70
71namespace {
72
73/**
74 * @brief Wrap a @c std::string in a Postgres @c text Datum.
75 */
76Datum text_datum(const std::string &s) {
77 text *result = (text *) palloc(VARHDRSZ + s.size());
78 SET_VARSIZE(result, VARHDRSZ + s.size());
79 memcpy(VARDATA(result), s.c_str(), s.size());
80 return PointerGetDatum(result);
81}
82
83// to_datum overloads: encode a semiring's evaluation result as a Postgres Datum.
84Datum to_datum(const semiring::Boolean &, bool v) { return BoolGetDatum(v); }
85Datum to_datum(const semiring::Counting &, unsigned v) { return Int32GetDatum(static_cast<int32>(v)); }
86Datum to_datum(const semiring::Tropical &, double v) { return Float8GetDatum(v); }
87Datum to_datum(const semiring::Viterbi &, double v) { return Float8GetDatum(v); }
88Datum to_datum(const semiring::Lukasiewicz &, double v) { return Float8GetDatum(v); }
89Datum to_datum(const semiring::Formula &sr, const std::string &v) { return text_datum(sr.to_text(v)); }
90Datum to_datum(const semiring::Why &sr, const semiring::why_provenance_t &v) { return text_datum(sr.to_text(v)); }
91Datum to_datum(const semiring::How &sr, const semiring::how_provenance_t &v) { return text_datum(sr.to_text(v)); }
92Datum to_datum(const semiring::Which &sr, const semiring::which_provenance_t &v) { return text_datum(sr.to_text(v)); }
93#if PG_VERSION_NUM >= 140000
94Datum to_datum(const semiring::IntervalUnion &, Datum v) { return v; }
95#endif
96Datum to_datum(const semiring::MinMax &, Datum v) { return v; }
97
98/**
99 * @brief Run the four-step compiled-evaluation pipeline for a semiring.
100 *
101 * 1. Build the input-gate → leaf-value mapping by parsing each leaf via
102 * @c sr.parse_leaf().
103 * 2. Rewrite HAVING comparison gates in @p c via @c provsql_having().
104 * 3. Evaluate the main circuit at @p g.
105 * 4. Encode the result as a Postgres Datum via @c to_datum(sr, ...).
106 *
107 * @tparam Sem The compiled semiring class.
108 * @param constants Extension OID cache.
109 * @param c Generic circuit to evaluate.
110 * @param g Root gate.
111 * @param sr Semiring instance.
112 * @param drop_table Whether the temporary UUID table should be dropped.
113 * @return The semiring evaluation result encoded as a Datum.
114 */
115template <typename Sem>
116Datum pec(
117 const constants_t &constants,
119 gate_t g,
120 const Sem &sr,
121 bool drop_table)
122{
123 using V = typename Sem::value_type;
124 std::unordered_map<gate_t, V> mapping;
125 initialize_provenance_mapping<V>(constants, c, mapping,
126 [&sr](const char *v) { return sr.parse_leaf(v); }, drop_table);
127
128 provsql_having(c, g, mapping, sr);
129 V out = c.evaluate<Sem>(g, mapping, sr);
130 return to_datum(sr, std::move(out));
131}
132
133/**
134 * @brief Evaluate the BoolExpr semiring; bypasses the GenericCircuit pipeline.
135 *
136 * With @p labels, each input gate renders as its mapped label; without,
137 * leaves render as the default @c x@<id@> placeholders.
138 */
139Datum pec_boolexpr(
140 BooleanCircuit &bc,
141 gate_t root,
142 const std::unordered_map<gate_t, std::string> *labels)
143{
144 return text_datum(labels ? bc.toString(root, *labels) : bc.toString(root));
145}
146
147} // namespace
148
149/**
150 * @brief Join a provenance mapping table with a set of UUIDs using SPI.
151 * @param table OID of the provenance mapping relation.
152 * @param uuids List of UUID strings to join against.
153 * @return @c true if a temporary table was created (caller must drop it).
154 */
155bool join_with_temp_uuids(Oid table, const std::vector<std::string> &uuids) {
156 if (SPI_connect() != SPI_OK_CONNECT)
157 throw CircuitException("SPI_connect failed");
158
159 char *table_name = get_rel_name(table);
160 if (!table_name) {
161 SPI_finish();
162 throw CircuitException("Invalid OID: no such table");
163 }
164 // Schema-qualify so the lookup works regardless of search_path.
165 // get_rel_name alone would resolve unqualified, which fails when the
166 // mapping lives in a schema not on search_path (e.g. provsql_test in
167 // the regression DB, or any user schema reached via Studio with the
168 // default "public, provsql" search_path).
169 const char *qualified = quote_qualified_identifier(
170 get_namespace_name(get_rel_namespace(table)), table_name);
171
172 constexpr size_t nb_max_uuid_value = 10000000;
173
174 StringInfoData join_query;
175 initStringInfo(&join_query);
176 bool drop_table = false;
177
178 // Two different mechanisms to implement the join (unless there are no
179 // UUIDs):
180 // if there are less than nb_max_uuid_value UUIDs, we do a join
181 // with a VALUES() list;
182 // otherwise we create a temporary table where we dump the inserts
183 // and join with it.
184 if(uuids.size() == 0) {
185 appendStringInfo(&join_query,
186 "SELECT value, provenance FROM %s WHERE 'f'", qualified);
187 } else if(uuids.size() <= nb_max_uuid_value) {
188 appendStringInfo(&join_query,
189 "SELECT value, provenance FROM %s t JOIN (VALUES", qualified);
190 bool first=true;
191 for(auto u: uuids) {
192 appendStringInfo(&join_query, "%s('%s'::uuid)", first?"":",", u.c_str());
193 first=false;
194 }
195
196 appendStringInfo(&join_query, ") AS u(id) ON t.provenance=u.id");
197 } else {
198 const char *create_temp_table = "CREATE TEMP TABLE tmp_uuids(id uuid);";
199 if (SPI_exec(create_temp_table, 0) != SPI_OK_UTILITY) {
200 SPI_finish();
201 throw CircuitException("Failed to create temporary table");
202 }
203 drop_table = true;
204
205 constexpr size_t batch_size = 1000;
206
207 for (size_t offset = 0; offset < uuids.size(); offset += batch_size) {
208 StringInfoData insert_query;
209 initStringInfo(&insert_query);
210 appendStringInfo(&insert_query, "INSERT INTO tmp_uuids VALUES ");
211
212 size_t end = std::min(offset + batch_size, uuids.size());
213 for (size_t i = offset; i < end; ++i) {
214 appendStringInfo(&insert_query, "('%s')%s", uuids[i].c_str(), (i + 1 == end) ? "" : ",");
215 }
216
217 int retval=SPI_exec(insert_query.data, 0);
218 pfree(insert_query.data);
219
220 if(retval != SPI_OK_INSERT) {
221 SPI_exec(drop_temp_table, 0);
222 SPI_finish();
223 throw CircuitException("Batch insert into temp table failed");
224 }
225 }
226
227 appendStringInfo(&join_query,
228 "SELECT value, provenance FROM %s t JOIN tmp_uuids u ON t.provenance = u.id", qualified);
229 }
230
231 if (SPI_exec(join_query.data, 0) != SPI_OK_SELECT) {
232 if(drop_table)
233 SPI_exec(drop_temp_table, 0);
234 SPI_finish();
235 throw CircuitException("Join query failed");
236 }
237
238 return drop_table;
239}
240
241/**
242 * @brief Core implementation of compiled provenance evaluation.
243 * @param token UUID of the root provenance gate.
244 * @param table OID of the provenance mapping relation.
245 * @param semiring Name of the semiring to evaluate over.
246 * @param type OID of the result element type.
247 * @return Datum containing the semiring evaluation result.
248 */
250 (pg_uuid_t token, Oid table, const std::string &semiring, Oid type)
251{
252 constants_t constants = get_constants(true);
253
254 // boolexpr without a mapping has nothing to fetch from SPI: skip the
255 // GenericCircuit build + temp-UUID join and read the BooleanCircuit
256 // directly. Every other case (including boolexpr WITH a mapping)
257 // shares the prelude below.
258 if(semiring=="boolexpr" && !OidIsValid(table)) {
259 gate_t root;
260 BooleanCircuit bc = getBooleanCircuit(token, root);
261 return pec_boolexpr(bc, root, nullptr);
262 }
263
264 // The expectation evaluator reads its leaves directly from the
265 // gate_rv / gate_value extra fields; no leaf-mapping table is
266 // involved. Bypass the SPI prelude.
267 if(semiring=="expectation") {
268 if(type != constants.OID_TYPE_FLOAT)
269 throw CircuitException(
270 "Unknown element type for expectation: must be float8");
272 auto root = gc.getGate(uuid2string(token));
273 return Float8GetDatum(provsql::compute_expectation(gc, root));
274 }
275
277 auto g = c.getGate(uuid2string(token));
278 auto inputs = c.getInputs();
279
280 std::vector<std::string> inputs_uuid;
281 std::transform(inputs.begin(), inputs.end(), std::back_inserter(inputs_uuid), [&c](auto x) {
282 return c.getUUID(x);
283 });
284 bool drop_table = join_with_temp_uuids(table, inputs_uuid);
285
286 if(semiring=="boolexpr") {
287 // boolexpr with a mapping: build the gate-to-label map on the
288 // GenericCircuit (the mapping table is keyed by input UUIDs that
289 // don't survive translation to BooleanCircuit gate ids), then
290 // translate keys to bc gates via gc_to_bc so bc.toString can label
291 // each leaf with its mapped value.
292 gate_t root;
293 std::unordered_map<gate_t, std::string> gc_labels;
294 initialize_provenance_mapping<std::string>(constants, c, gc_labels, [](const char *v) {
295 return std::string(v);
296 }, drop_table);
297
298 std::unordered_map<gate_t, gate_t> gc_to_bc;
299 BooleanCircuit bc = getBooleanCircuit(c, token, root, gc_to_bc);
300
301 std::unordered_map<gate_t, std::string> bc_labels;
302 bc_labels.reserve(gc_labels.size());
303 for(const auto &kv : gc_labels) {
304 auto it = gc_to_bc.find(kv.first);
305 if(it != gc_to_bc.end())
306 bc_labels[it->second] = kv.second;
307 }
308 return pec_boolexpr(bc, root, &bc_labels);
309 }
310
311 if (type == constants.OID_TYPE_VARCHAR)
312 {
313 if (semiring == "formula") return pec(constants, c, g, semiring::Formula{}, drop_table);
314 if (semiring == "why") return pec(constants, c, g, semiring::Why{}, drop_table);
315 if (semiring == "how") return pec(constants, c, g, semiring::How{}, drop_table);
316 if (semiring == "which") return pec(constants, c, g, semiring::Which{}, drop_table);
317 throw CircuitException("Unknown semiring for type varchar: " + semiring);
318 }
319 if (type == constants.OID_TYPE_INT) {
320 if (semiring == "counting") return pec(constants, c, g, semiring::Counting{}, drop_table);
321 throw CircuitException("Unknown semiring for type int: " + semiring);
322 }
323 if (type == constants.OID_TYPE_FLOAT) {
324 if (semiring == "tropical") return pec(constants, c, g, semiring::Tropical{}, drop_table);
325 if (semiring == "viterbi") return pec(constants, c, g, semiring::Viterbi{}, drop_table);
326 if (semiring == "lukasiewicz") return pec(constants, c, g, semiring::Lukasiewicz{}, drop_table);
327 throw CircuitException("Unknown semiring for type float: " + semiring);
328 }
329 if (type == constants.OID_TYPE_BOOL) {
330 if (semiring == "boolean") return pec(constants, c, g, semiring::Boolean{}, drop_table);
331 throw CircuitException("Unknown semiring for type bool: " + semiring);
332 }
333#if PG_VERSION_NUM >= 140000
334 if (type == constants.OID_TYPE_TSTZMULTIRANGE) {
335 if (semiring != "temporal" && semiring != "interval_union")
336 throw CircuitException("Unknown semiring for type tstzmultirange: " + semiring);
337 return pec(constants, c, g, semiring::IntervalUnion(constants.OID_TYPE_TSTZMULTIRANGE), drop_table);
338 }
339 if (type == constants.OID_TYPE_NUMMULTIRANGE) {
340 if (semiring != "interval_union")
341 throw CircuitException("Unknown semiring for type nummultirange: " + semiring);
342 return pec(constants, c, g, semiring::IntervalUnion(constants.OID_TYPE_NUMMULTIRANGE), drop_table);
343 }
344 if (type == constants.OID_TYPE_INT4MULTIRANGE) {
345 if (semiring != "interval_union")
346 throw CircuitException("Unknown semiring for type int4multirange: " + semiring);
347 return pec(constants, c, g, semiring::IntervalUnion(constants.OID_TYPE_INT4MULTIRANGE), drop_table);
348 }
349#endif
350 if (get_typtype(type) == TYPTYPE_ENUM) {
351 if (semiring == "minmax") return pec(constants, c, g, semiring::MinMax(type, false), drop_table);
352 if (semiring == "maxmin") return pec(constants, c, g, semiring::MinMax(type, true), drop_table);
353 throw CircuitException("Unknown semiring for enum type: " + semiring);
354 }
355 throw CircuitException("Unknown element type for provenance_evaluate_compiled");
356}
357
358/** @brief PostgreSQL-callable wrapper for provenance_evaluate_compiled(). */
359Datum provenance_evaluate_compiled(PG_FUNCTION_ARGS)
360{
361 try {
362 Datum token = PG_GETARG_DATUM(0);
363
364 Oid table = PG_GETARG_OID(1);
365
366 text *t = PG_GETARG_TEXT_P(2);
367 std::string semiring(VARDATA(t),VARSIZE(t)-VARHDRSZ);
368
369 Oid type = get_fn_expr_argtype(fcinfo->flinfo, 3);
370
371 return provenance_evaluate_compiled_internal(*DatumGetUUIDP(token), table, semiring, type);
372 } catch(const std::exception &e) {
373 provsql_error("provenance_evaluate_compiled: %s", e.what());
374 } catch(...) {
375 provsql_error("provenance_evaluate_compiled: Unknown exception");
376 }
377
378 PG_RETURN_NULL();
379}
Boolean-expression (lineage formula) semiring.
Boolean semiring ({false, true}, ∨, ∧, false, true).
BooleanCircuit getBooleanCircuit(GenericCircuit &gc, pg_uuid_t token, gate_t &gate, std::unordered_map< gate_t, gate_t > &gc_to_bc)
Build a BooleanCircuit from an already-loaded GenericCircuit.
GenericCircuit getGenericCircuit(pg_uuid_t token)
Build a GenericCircuit from the mmap store rooted at token.
gate_t
Strongly-typed gate identifier.
Definition Circuit.h:49
Counting semiring (ℕ, +, ×, 0, 1).
Analytical expectation / variance / moment evaluator over RV circuits.
Symbolic representation of provenance as a human-readable formula.
How-provenance m-semiring (canonical polynomial provenance ℕ[X]).
Interval-union m-semiring over PostgreSQL multirange types.
Łukasiewicz fuzzy m-semiring over .
Min-max and max-min m-semirings over PostgreSQL enum types.
Tropical (min-plus) m-semiring over .
Viterbi (max-times) m-semiring over .
Which-provenance (lineage) m-semiring.
Why-provenance semiring (set of witness sets).
Boolean circuit for provenance formula evaluation.
virtual std::string toString(gate_t g) const override
Return a textual description of gate g for debugging.
Exception type thrown by circuit operations on invalid input.
Definition Circuit.h:206
gate_t getGate(const uuid &u)
Return (or create) the gate associated with UUID u.
Definition Circuit.hpp:33
In-memory provenance circuit with semiring-generic evaluation.
S::value_type evaluate(gate_t g, std::unordered_map< gate_t, typename S::value_type > &provenance_mapping, S semiring) const
Evaluate the sub-circuit rooted at gate g over semiring semiring.
const std::set< gate_t > & getInputs() const
Return the set of input (leaf) gates.
The Boolean semiring over bool.
Definition Boolean.h:41
The counting semiring over unsigned.
Definition Counting.h:42
Symbolic provenance representation over std::string.
Definition Formula.h:111
value_type parse_leaf(const char *v) const
Definition Formula.h:269
std::string to_text(const value_type &s) const
Serialise a Formula evaluation as text.
Definition Formula.h:289
How-provenance m-semiring over .
Definition How.h:72
Interval-union m-semiring with Datum carrier, parameterised by a multirange type OID.
The Łukasiewicz fuzzy m-semiring over double.
Definition Lukasiewicz.h:58
Min-max / max-min m-semiring with Datum carrier over a PostgreSQL enum type.
Definition MinMax.h:77
Tropical (min-plus) m-semiring over double.
Definition Tropical.h:52
The Viterbi (max-times) m-semiring over double.
Definition Viterbi.h:50
Which-provenance (lineage) semiring.
Definition Which.h:58
Why-provenance semiring.
Definition Why.h:56
Provenance evaluation helper for HAVING-clause circuits.
void provsql_having(GenericCircuit &c, gate_t g, MapT &mapping, SemiringT S=SemiringT{})
Rewrite HAVING comparison gates in the circuit by enumerating possible worlds.
double compute_expectation(const GenericCircuit &gc, gate_t root, std::optional< gate_t > event_root)
Compute (or if event_root is set) over the scalar sub-circuit rooted at root.
std::map< how_monomial_t, unsigned > how_provenance_t
How-provenance value: each monomial mapped to its (positive) coefficient.
Definition How.h:62
std::set< label_set > why_provenance_t
Why-provenance value: the full set of all witnesses.
Definition Why.h:49
std::optional< std::set< std::string > > which_provenance_t
Which-provenance value: a set of labels, or (empty optional).
Definition Which.h:50
static Datum provenance_evaluate_compiled_internal(pg_uuid_t token, Oid table, const std::string &semiring, Oid type)
Core implementation of compiled provenance evaluation.
const char * drop_temp_table
DROP TABLE statement for the per-query temporary provenance mapping table.
bool join_with_temp_uuids(Oid table, const std::vector< std::string > &uuids)
Join a provenance mapping table with a set of UUIDs using SPI.
Datum provenance_evaluate_compiled(PG_FUNCTION_ARGS)
PostgreSQL-callable wrapper for provenance_evaluate_compiled().
Template helper for populating provenance mappings from SPI results.
void initialize_provenance_mapping(const constants_t &constants, GenericCircuit &c, std::unordered_map< gate_t, T > &provenance_mapping, const std::function< T(const char *)> &charp_to_value, bool drop_table)
Populate a provenance mapping from the current SPI result set.
#define provsql_error(fmt,...)
Report a fatal ProvSQL error and abort the current transaction.
constants_t get_constants(bool failure_if_not_possible)
Retrieve the cached OID constants for the current database.
Core types, constants, and utilities shared across ProvSQL.
string uuid2string(pg_uuid_t uuid)
Format a pg_uuid_t as a std::string.
Structure to store the value of various constants.
Oid OID_TYPE_VARCHAR
OID of the VARCHAR TYPE.
Oid OID_TYPE_FLOAT
OID of the FLOAT TYPE.
Oid OID_TYPE_INT
OID of the INT TYPE.
Oid OID_TYPE_TSTZMULTIRANGE
OID of the tstzmultirange TYPE (PG14+, InvalidOid otherwise).
Oid OID_TYPE_BOOL
OID of the BOOL TYPE.
Oid OID_TYPE_NUMMULTIRANGE
OID of the nummultirange TYPE (PG14+, InvalidOid otherwise).
Oid OID_TYPE_INT4MULTIRANGE
OID of the int4multirange TYPE (PG14+, InvalidOid otherwise).
UUID structure.