ProvSQL SQL API
Adding support for provenance and uncertainty management to PostgreSQL databases
Loading...
Searching...
No Matches
provsql.sql
Go to the documentation of this file.
1/**
2 * @file
3 * @brief ProvSQL PL/pgSQL extension code
4 *
5 * This file contains the PL/pgSQL code of the ProvSQL extension. This
6 * extension requires the standard UUID-ossp extension.
7 */
8
9/**
10 * @brief <tt>provsql</tt> schema
11 *
12 * All types and functions introduced by ProvSQL are defined in the
13 * provsql schema, requiring prefixing them by <tt>provsql.</tt> or
14 * using PostgreSQL's <tt>search_path</tt> variable with a command such
15 * as \code{.sql}SET search_path TO public, provsql;\endcode
16 */
17CREATE SCHEMA provsql;
18
19SET search_path TO provsql;
20
21/**
22 * @brief Provenance circuit gate types
23 *
24 * Each gate in the provenance circuit has a type that determines
25 * its semantics during semiring evaluation.
26 */
27CREATE TYPE PROVENANCE_GATE AS
28 ENUM(
29 'input', -- Input (variable) gate of the circuit
30 'plus', -- Semiring plus
31 'times', -- Semiring times
32 'monus', -- M-Semiring monus
33 'project', -- Project gate (for where provenance)
34 'zero', -- Semiring zero
35 'one', -- Semiring one
36 'eq', -- Equijoin gate (for where provenance)
37 'agg', -- Aggregation operator (for aggregate provenance)
38 'semimod', -- Semimodule scalar multiplication (for aggregate provenance)
39 'cmp', -- Comparison of aggregate values (HAVING-clause provenance)
40 'delta', -- δ-semiring operator (see Amsterdamer, Deutch, Tannen, PODS 2011)
41 'value', -- Scalar value (for aggregate provenance)
42 'mulinput',-- Multivalued input (for Boolean provenance)
43 'update', -- Update operation
44 'rv', -- Continuous random-variable leaf
45 'arith', -- n-ary arithmetic gate over scalar-valued children
46 'mixture', -- Probabilistic mixture of two scalar RV roots with a Bernoulli weight
47 'assumed_boolean' -- Structural marker over a single child: the
48 -- wrapped sub-circuit was computed under a
49 -- Boolean-provenance assumption (e.g. the safe-
50 -- query rewrite). Transparent for Boolean-
51 -- compatible evaluators, fatal error for the
52 -- rest, rendered as an explicit element in
53 -- PROV-XML export.
54 );
56/** @defgroup gate_manipulation Circuit gate manipulation
57 * Low-level functions for creating and querying provenance circuit gates.
58 * @{
59 */
61/**
62 * @brief Create a new gate in the provenance circuit
63 *
64 * @param token UUID identifying the new gate
65 * @param type gate type (see PROVENANCE_GATE)
66 * @param children optional array of child gate UUIDs
67 */
68CREATE OR REPLACE FUNCTION create_gate(
69 token UUID,
70 type PROVENANCE_GATE,
71 children UUID[] DEFAULT NULL)
72 RETURNS VOID AS
73 'provsql','create_gate' LANGUAGE C PARALLEL SAFE;
74/**
75 * @brief Return the gate type of a provenance token
76 *
77 * Returns @c 'input' for any token not yet materialized in the circuit,
78 * since input is the default semantics of an unmaterialized provenance token.
79 */
80CREATE OR REPLACE FUNCTION get_gate_type(
81 token UUID)
82 RETURNS PROVENANCE_GATE AS
83 'provsql','get_gate_type' LANGUAGE C IMMUTABLE PARALLEL SAFE;
84/** @brief Return the children of a provenance gate */
85CREATE OR REPLACE FUNCTION get_children(
86 token UUID)
87 RETURNS UUID[] AS
88 'provsql','get_children' LANGUAGE C IMMUTABLE PARALLEL SAFE;
89/**
90 * @brief Set the probability of an input gate
91 *
92 * @param token UUID of the input gate
93 * @param p probability value in [0,1]
94 */
95CREATE OR REPLACE FUNCTION set_prob(
96 token UUID, p DOUBLE PRECISION)
97 RETURNS VOID AS
98 'provsql','set_prob' LANGUAGE C PARALLEL SAFE;
99/** @brief Get the probability associated with an input gate */
100CREATE OR REPLACE FUNCTION get_prob(
101 token UUID)
102 RETURNS DOUBLE PRECISION AS
103 'provsql','get_prob' LANGUAGE C STABLE PARALLEL SAFE;
104
105/**
106 * @brief Set additional INTEGER values on provenance circuit gate
107 *
108 * This function sets two INTEGER values associated to a circuit gate, used in
109 * different ways by different gate types:
110 * - for mulinput, info1 indicates the value of this multivalued variable
111 * - for eq, info1 and info2 indicate the attribute index of the
112 equijoin in, respectively, the first and second columns
113 * - for agg, info1 is the oid of the aggregate function and info2 the
114 oid of the aggregate result type
115 * - for cmp, info1 is the oid of the comparison operator
116 *
117 * @param token UUID of the circuit gate
118 * @param info1 first INTEGER value
119 * @param info2 second INTEGER value
120 */
121CREATE OR REPLACE FUNCTION set_infos(
122 token UUID, info1 INT, info2 INT DEFAULT NULL)
123 RETURNS VOID AS
124 'provsql','set_infos' LANGUAGE C PARALLEL SAFE;
125
126/** @brief Get the INTEGER info values associated with a circuit gate */
127CREATE OR REPLACE FUNCTION get_infos(
128 token UUID, OUT info1 INT, OUT info2 INT)
129 RETURNS RECORD AS
130 'provsql','get_infos' LANGUAGE C STABLE PARALLEL SAFE;
131
132/**
133 * @brief Wrap @p token in a fresh @c gate_assumed_boolean and return
134 * the wrapper's UUID.
135 *
136 * Public primitive callable from any rewrite that needs to flag a
137 * sub-circuit as having been computed under a Boolean-provenance
138 * assumption (the safe-query rewriter is the first caller; future
139 * Boolean-only simplifications should reuse this). The wrapper is
140 * transparent for Boolean-compatible evaluators (probability, the
141 * @c sr_boolean / @c sr_boolexpr / @c sr_formula / @c sr_interval_*
142 * family, where-provenance) and raises a @c CircuitException for the
143 * rest. Always kept as an explicit node in PROV-XML export.
144 *
145 * The wrapper UUID is content-derived via @c uuid_generate_v5 on the
146 * child, so identical children always wrap to the same outer UUID
147 * (and distinct children always wrap to distinct outer UUIDs).
148 * No-op (returns NULL) on a NULL input.
149 */
150CREATE OR REPLACE FUNCTION assume_boolean(token UUID) RETURNS UUID AS
151$$
152DECLARE
153 wrapped UUID;
154BEGIN
155 IF token IS NULL THEN
156 RETURN NULL;
157 END IF;
158 wrapped := public.uuid_generate_v5(uuid_ns_provsql(),
159 concat('assumed_boolean', token));
160 PERFORM create_gate(wrapped, 'assumed_boolean', ARRAY[token]);
161 RETURN wrapped;
162END
163$$ LANGUAGE plpgsql SET search_path=provsql,pg_temp,public
164 SECURITY DEFINER PARALLEL SAFE;
165
166/**
167 * @brief Set extra TEXT information on provenance circuit gate
168 *
169 * This function sets TEXT-encoded data associated to a circuit gate, used in
170 * different ways by different gate types:
171 * - for project, it is a TEXT-encoded ARRAY of two-element ARRAYs that
172 * indicate mappings between input attribute (first element) and output
173 * attribute (second element)
174 * - for value and agg, it is the TEXT-encoded (base for value, computed
175 * for agg) scalar value
176 *
177 * @param token UUID of the circuit gate
178 * @param data TEXT-encoded information
179 */
180CREATE OR REPLACE FUNCTION set_extra(
181 token UUID, data TEXT)
182 RETURNS VOID AS
183 'provsql','set_extra' LANGUAGE C PARALLEL SAFE STRICT;
184/** @brief Get the TEXT-encoded extra data associated with a circuit gate */
185CREATE OR REPLACE FUNCTION get_extra(token UUID)
186 RETURNS TEXT AS
187 'provsql','get_extra' LANGUAGE C STABLE PARALLEL SAFE RETURNS NULL ON NULL INPUT;
188
189/**
190 * @brief Return the total number of materialized gates in the provenance circuit
191 *
192 * Input gates for provenance-tracked table rows are created lazily on
193 * first reference; rows that have never appeared in a query result are
194 * not counted.
195 */
196CREATE OR REPLACE FUNCTION get_nb_gates() RETURNS BIGINT AS
197 'provsql', 'get_nb_gates' LANGUAGE C PARALLEL SAFE;
198
199/** @} */
200
201/** @defgroup table_management Provenance table management
202 * Functions for enabling, disabling, and configuring provenance
203 * tracking on user tables.
204 * @{
205 */
206
207
208/**
209 * @brief Trigger function for DELETE statement provenance tracking
210 *
211 * Records the deletion and applies monus to provenance tokens of
212 * deleted rows. This is the version for PostgreSQL < 14.
213 */
214CREATE OR REPLACE FUNCTION delete_statement_trigger()
215 RETURNS TRIGGER AS
217DECLARE
218 query_text TEXT;
219 delete_token UUID;
220 old_token UUID;
221 new_token UUID;
222 r RECORD;
223BEGIN
224 delete_token := public.uuid_generate_v4();
225
226 PERFORM create_gate(delete_token, 'input');
227
228 SELECT query
229 INTO query_text
230 FROM pg_stat_activity
231 WHERE pid = pg_backend_pid();
232
233 INSERT INTO delete_provenance (delete_token, query, deleted_by, deleted_at)
234 VALUES (delete_token, query_text, current_user, CURRENT_TIMESTAMP);
235
236 EXECUTE format('INSERT INTO %I.%I SELECT * FROM OLD_TABLE;', TG_TABLE_SCHEMA, TG_TABLE_NAME);
237
238 FOR r IN (SELECT * FROM OLD_TABLE) LOOP
239 old_token := r.provsql;
240 new_token := provenance_monus(old_token, delete_token);
241
242 EXECUTE format('UPDATE %I.%I SET provsql = $1 WHERE provsql = $2;', TG_TABLE_SCHEMA, TG_TABLE_NAME)
243 USING new_token, old_token;
244 END LOOP;
245
246 RETURN NULL;
247END
248$$ LANGUAGE plpgsql SET search_path=provsql,pg_temp SECURITY DEFINER;
250
251/**
252 * @brief Record per-relation provenance metadata used by the
253 * safe-query optimisation.
254 *
255 * Stores a @c (relid, kind, block_key) RECORD in the persistent
256 * mmap-backed table-info store. @p kind is one of:
257 * - @c 'tid' -- independent input leaves (post-@c add_provenance default)
258 * - @c 'bid' -- block-correlated leaves; rows sharing the same value
259 * of @p block_key are mutually exclusive. An empty
260 * @p block_key means the whole table is one block.
261 * - @c 'opaque' -- arbitrary correlations from a derived source
262 * (CREATE TABLE AS SELECT, INSERT INTO SELECT,
263 * UPDATE under provsql.update_provenance); the
264 * safe-query rewriter must bail on these.
265 *
266 * @param relid pg_class OID of the relation.
267 * @param kind One of @c 'tid' / @c 'bid' / @c 'opaque'.
268 * @param block_key Block-key column numbers (only meaningful for
269 * @c 'bid'; ignored otherwise but conventionally
270 * passed empty).
271 */
272CREATE OR REPLACE FUNCTION set_table_info(
273 relid OID, kind TEXT, block_key INT2[] DEFAULT ARRAY[]::INT2[])
274 RETURNS VOID AS
275 'provsql','set_table_info' LANGUAGE C PARALLEL SAFE;
276
277/** @brief Remove per-relation provenance metadata. No-op when missing. */
278CREATE OR REPLACE FUNCTION remove_table_info(relid OID)
279 RETURNS VOID AS
280 'provsql','remove_table_info' LANGUAGE C PARALLEL SAFE;
281
282/**
283 * @brief Read per-relation provenance metadata.
285 * Returns NULL if no RECORD exists. @c kind is one of @c 'tid' /
286 * @c 'bid' / @c 'opaque'; @c block_key is the (possibly empty) array
287 * of block-key column numbers, only meaningful when @c kind = @c 'bid'.
288 * Used by the planner-time hierarchy detector to gate the safe-query
289 * rewrite.
290 */
291CREATE OR REPLACE FUNCTION get_table_info(
292 relid OID, OUT kind TEXT, OUT block_key INT2[])
293 RETURNS RECORD AS
294 'provsql','get_table_info' LANGUAGE C STABLE PARALLEL SAFE;
295
296/**
297 * @brief Record the base-relation ancestor set of a tracked relation.
298 *
299 * Base tables created with @c add_provenance / @c repair_key carry
300 * @c {self}; CTAS-derived tables inherit the union of their sources'
301 * ancestor sets. The safe-query rewriter consults the registry to
302 * enforce that joined FROM entries have disjoint base ancestors
303 * before firing the read-once factoring.
304 *
305 * The worker preserves the relation's existing @c kind / @c block_key
306 * half on update; it silently no-ops when no kind RECORD exists for
307 * @p relid (callers should run @c add_provenance / @c repair_key
308 * first). The ancestor list is capped at 64 entries (clear error if
309 * exceeded).
310 *
311 * @param relid pg_class OID of the relation.
312 * @param ancestors Sorted, deduplicated base-relation OIDs.
313 */
314CREATE OR REPLACE FUNCTION set_ancestors(
315 relid OID, ancestors OID[] DEFAULT ARRAY[]::OID[])
316 RETURNS VOID AS
317 'provsql','set_ancestors' LANGUAGE C PARALLEL SAFE;
318
319/** @brief Clear the ancestor half of a per-relation RECORD (keeps kind/block_key).
320 * No-op when missing. */
321CREATE OR REPLACE FUNCTION remove_ancestors(relid OID)
322 RETURNS VOID AS
323 'provsql','remove_ancestors' LANGUAGE C PARALLEL SAFE;
324
325/**
326 * @brief Read the base-relation ancestor set of a tracked relation.
327 *
328 * Returns @c NULL when no ancestor RECORD exists for @p relid (or the
329 * RECORD is empty -- both cases make the safe-query rewriter take
330 * its conservative refuse path, so they collapse here).
331 */
332CREATE OR REPLACE FUNCTION get_ancestors(relid OID)
333 RETURNS OID[] AS
334 'provsql','get_ancestors' LANGUAGE C STABLE PARALLEL SAFE;
335
336/**
337 * @brief BEFORE INSERT OR UPDATE OF provsql row trigger installed by
338 * @c add_provenance.
339 *
340 * Two jobs:
341 *
342 * 1. Fill @c NEW.provsql with a fresh @c uuid_generate_v4 leaf when
343 * the user did not supply one (this replaces the column DEFAULT
344 * that @c add_provenance used to install: a real DEFAULT would
345 * fire before the trigger sees the row, so we could not tell
346 * "user omitted the column" from "user supplied a value").
347 * 2. When the user does supply a non-NULL @c provsql on @c INSERT,
348 * or changes it on @c UPDATE, flip the table's per-table
349 * metadata to @c OPAQUE. The user is free to write whatever
350 * UUIDs they want (cross-table reuse, compound tokens minted
351 * via @c create_gate, ...); the cost is that the safe-query
352 * rewriter then refuses to fire on this table, because TID
353 * independence can no longer be assumed.
354 */
355CREATE OR REPLACE FUNCTION provenance_guard()
356 RETURNS TRIGGER AS $$
357BEGIN
358 IF TG_OP = 'INSERT' THEN
359 IF NEW.provsql IS NULL THEN
360 NEW.provsql := public.uuid_generate_v4();
361 ELSE
362 PERFORM provsql.set_table_info(TG_RELID, 'opaque');
363 END IF;
364 ELSIF TG_OP = 'UPDATE' THEN
365 IF NEW.provsql IS DISTINCT FROM OLD.provsql THEN
366 PERFORM provsql.set_table_info(TG_RELID, 'opaque');
367 END IF;
368 END IF;
369 RETURN NEW;
370END;
371$$ LANGUAGE plpgsql SET search_path=provsql,pg_temp,public
372 SECURITY DEFINER;
374/**
375 * @brief Enable provenance tracking on an existing table
376 *
377 * Adds a <tt>provsql</tt> UUID column to the table, an index for
378 * fast UUID-keyed lookups, and a BEFORE INSERT/UPDATE row trigger
379 * (@c provenance_guard) that mints a fresh @c uuid_generate_v4
380 * leaf when the user omits the column on INSERT, or flips the
381 * table's metadata to @c OPAQUE when the user supplies their own
382 * value. Input gates for existing rows are created lazily when
383 * first referenced by a query.
384 *
385 * @param _tbl the table to add provenance tracking to
386 */
387CREATE OR REPLACE FUNCTION add_provenance(_tbl REGCLASS)
388 RETURNS VOID AS
389$$
390BEGIN
391 -- No DEFAULT: the guard trigger mints the UUID, so the trigger can
392 -- distinguish "user omitted" (NULL) from "user supplied a value".
393 -- No UNIQUE: we no longer rely on it to keep the table TID -- the
394 -- guard does that semantically -- and a UNIQUE would reject the
395 -- legitimate cross-table UUID copy that just flips the table to
396 -- OPAQUE. We keep a plain index for fast UUID-keyed lookups.
397 EXECUTE format('ALTER TABLE %s ADD COLUMN provsql UUID', _tbl);
398 EXECUTE format(
399 'UPDATE %s SET provsql = public.uuid_generate_v4() WHERE provsql IS NULL',
400 _tbl);
401 EXECUTE format('CREATE INDEX ON %s(provsql)', _tbl);
402 EXECUTE format(
403 'CREATE TRIGGER provenance_guard BEFORE INSERT OR UPDATE OF provsql '
404 'ON %s FOR EACH ROW EXECUTE PROCEDURE provsql.provenance_guard()',
405 _tbl);
406 PERFORM provsql.set_table_info(_tbl::oid, 'tid');
407 -- Seed the base-ancestor set to {self}: a base TID table's atoms
408 -- come from itself and no other relation. CTAS-derived tables
409 -- inherit unions of source ancestor sets; that is handled by the
410 -- CTAS hook (a separate slice), not here.
411 PERFORM provsql.set_ancestors(_tbl::oid, ARRAY[_tbl::oid]);
412END
413$$ LANGUAGE plpgsql SECURITY DEFINER;
414
415/**
416 * @brief Remove provenance tracking from a table
417 *
418 * Drops the <tt>provsql</tt> column and associated triggers.
419 *
420 * @param _tbl the table to remove provenance tracking from
421 */
422CREATE OR REPLACE FUNCTION remove_provenance(_tbl REGCLASS)
423 RETURNS VOID AS
424$$
425DECLARE
426BEGIN
427 PERFORM provsql.remove_table_info(_tbl::oid);
428 -- Drop the BEFORE INSERT/UPDATE guard first: it has a column
429 -- dependency on provsql (via the OF provsql clause), so the
430 -- subsequent DROP COLUMN would otherwise raise.
431 BEGIN
432 EXECUTE format('DROP TRIGGER provenance_guard on %s', _tbl);
433 EXCEPTION WHEN undefined_object THEN
434 END;
435 EXECUTE format('ALTER TABLE %s DROP COLUMN provsql', _tbl);
436 BEGIN
437 EXECUTE format('DROP TRIGGER add_gate on %s', _tbl);
438 EXCEPTION WHEN undefined_object THEN
439 END;
440 BEGIN
441 EXECUTE format('DROP TRIGGER insert_statement on %s', _tbl);
442 EXECUTE format('DROP TRIGGER update_statement on %s', _tbl);
443 EXECUTE format('DROP TRIGGER delete_statement on %s', _tbl);
444 EXCEPTION WHEN undefined_object THEN
445 END;
446END
447$$ LANGUAGE plpgsql;
448
449/**
450 * @brief Set up provenance for a table with duplicate key values
451 *
452 * When a table has duplicate rows for a given key, this function
453 * replaces simple input gates with multivalued input (mulinput) gates
454 * that model a uniform distribution over duplicates.
455 *
456 * @param _tbl the table to repair
457 * @param key_att the key attribute(s) as a comma-separated string, or
458 * empty string if the whole table is one group
459 */
460CREATE OR REPLACE FUNCTION repair_key(_tbl REGCLASS, key_att TEXT)
461 RETURNS VOID AS
463DECLARE
464 r RECORD;
465 rows_query TEXT;
466 block_key_cols INT2[];
467BEGIN
468 -- Resolve the (possibly comma-separated) key_att TEXT into the
469 -- corresponding pg_attribute.attnum values for the safe-query
470 -- metadata. Names are trimmed; quoting is not supported because
471 -- repair_key has never accepted quoted identifiers in key_att.
472 IF key_att = '' THEN
473 block_key_cols := ARRAY[]::INT2[];
474 ELSE
475 SELECT array_agg(a.attnum ORDER BY t.ord)::INT2[]
476 INTO block_key_cols
477 FROM unnest(string_to[](key_att, ',')) WITH ORDINALITY AS t(name, ord)
478 JOIN pg_attribute a
479 ON a.attrelid = _tbl
480 AND a.attname = trim(t.name)
481 AND a.attnum > 0
482 AND NOT a.attisdropped;
483 IF block_key_cols IS NULL OR array_length(block_key_cols, 1) IS NULL THEN
484 RAISE EXCEPTION 'repair_key: could not resolve key columns from "%"', key_att;
485 END IF;
486 IF array_length(block_key_cols, 1) > 16 THEN
487 RAISE EXCEPTION 'repair_key: block key wider than 16 columns is not supported';
488 END IF;
489 END IF;
491 -- Same column shape as add_provenance: no UNIQUE, no DEFAULT past
492 -- the initial backfill (the guard trigger added after the rename
493 -- takes over both jobs once the column has been renamed to its
494 -- final name). The DEFAULT is kept here only so the second pass
495 -- below can read provsql_temp from the user-visible rows
496 -- without a separate UPDATE.
497 EXECUTE format('ALTER TABLE %s ADD COLUMN provsql_temp UUID DEFAULT public.uuid_generate_v4()', _tbl);
498
499 -- Build a per-group mapping (key columns + a fresh key_token + the
500 -- group size) once, then use it for both the create_gate(key_token,
501 -- 'input') first pass and the per-row mulinput second pass. Going
502 -- through a temp table avoids re-running uuid_generate_v4() (which
503 -- would produce different UUIDs the second time). USING (%1$s) on
504 -- the second pass handles the multi-column case uniformly.
505 -- ON COMMIT DROP plus the explicit DROP TABLE at the end of this
506 -- function leave the temp table cleaned up across transactions and
507 -- across repeated calls in the same transaction.
508 IF key_att = '' THEN
509 EXECUTE format(
510 'CREATE TEMP TABLE provsql_repair_key_tmp ON COMMIT DROP AS
511 SELECT public.uuid_generate_v4() AS provsql_key_token,
512 COUNT(*) AS provsql_group_size
513 FROM %s', _tbl);
514 rows_query := format(
515 'SELECT t.provsql_temp,
516 k.provsql_key_token AS key_token,
517 ROW_NUMBER() OVER (ORDER BY t.ctid) AS within_group,
518 k.provsql_group_size AS group_size
519 FROM %s t CROSS JOIN provsql_repair_key_tmp k', _tbl);
520 ELSE
521 EXECUTE format(
522 'CREATE TEMP TABLE provsql_repair_key_tmp ON COMMIT DROP AS
523 SELECT %1$s,
524 public.uuid_generate_v4() AS provsql_key_token,
525 COUNT(*) AS provsql_group_size
526 FROM %2$s
527 GROUP BY %1$s', key_att, _tbl);
528 rows_query := format(
529 'SELECT t.provsql_temp,
530 k.provsql_key_token AS key_token,
531 ROW_NUMBER() OVER (PARTITION BY k.provsql_key_token
532 ORDER BY t.ctid) AS within_group,
533 k.provsql_group_size AS group_size
534 FROM %2$s t
535 JOIN provsql_repair_key_tmp k USING (%1$s)', key_att, _tbl);
536 END IF;
537
538 -- Pass 1: one input gate per group key.
539 FOR r IN SELECT provsql_key_token FROM provsql_repair_key_tmp LOOP
540 PERFORM provsql.create_gate(r.provsql_key_token, 'input');
541 END LOOP;
542
543 -- Pass 2: per row, attach a mulinput gate to its group's key token.
544 FOR r IN EXECUTE rows_query LOOP
545 PERFORM provsql.create_gate(r.provsql_temp, 'mulinput', ARRAY[r.key_token]);
546 PERFORM provsql.set_prob(r.provsql_temp, 1./r.group_size);
547 PERFORM provsql.set_infos(r.provsql_temp, r.within_group::INT);
548 END LOOP;
549
550 DROP TABLE provsql_repair_key_tmp;
551
552 EXECUTE format('ALTER TABLE %s ALTER COLUMN provsql_temp DROP DEFAULT', _tbl);
553 EXECUTE format('ALTER TABLE %s RENAME COLUMN provsql_temp TO provsql', _tbl);
554 EXECUTE format('CREATE INDEX ON %s(provsql)', _tbl);
555 EXECUTE format(
556 'CREATE TRIGGER provenance_guard BEFORE INSERT OR UPDATE OF provsql '
557 'ON %s FOR EACH ROW EXECUTE PROCEDURE provsql.provenance_guard()',
558 _tbl);
559 PERFORM provsql.set_table_info(_tbl::oid, 'bid', block_key_cols);
560 -- Base BID tables also have themselves as their sole ancestor. Same
561 -- rationale as the @c add_provenance branch above.
562 PERFORM provsql.set_ancestors(_tbl::oid, ARRAY[_tbl::oid]);
563END
564$$ LANGUAGE plpgsql;
565
566/**
567 * @brief Event trigger that purges per-table provenance metadata when
568 * a tracked relation is dropped outside of remove_provenance().
569 *
570 * Plain DROP TABLE bypasses remove_provenance() and would otherwise
571 * leave a stale entry in the table-info store keyed by a now-recycled
572 * OID, with confusing consequences for the safe-query rewriter the
573 * next time the OID is reused. This trigger forwards every dropped
574 * relation OID to provsql.remove_table_info(), which is a no-op for
575 * relations that were not tracked.
576 */
577CREATE OR REPLACE FUNCTION cleanup_table_info()
578 RETURNS event_trigger AS
579$$
580DECLARE
581 r RECORD;
582BEGIN
583 FOR r IN
584 SELECT objid FROM pg_event_trigger_dropped_objects()
585 WHERE object_type IN ('table', 'foreign table', 'materialized view')
586 LOOP
587 PERFORM provsql.remove_table_info(r.objid);
588 END LOOP;
589END
590$$ LANGUAGE plpgsql;
591
592DROP EVENT TRIGGER IF EXISTS provsql_cleanup_table_info;
593-- @c EXECUTE @c PROCEDURE (rather than the PG 11+ @c EXECUTE
594-- @c FUNCTION alias) so the extension installs on PG 10 too.
595CREATE EVENT TRIGGER provsql_cleanup_table_info ON sql_drop
596 EXECUTE PROCEDURE provsql.cleanup_table_info();
597
598/**
599 * @brief Create a provenance mapping table from an attribute
600 *
601 * Creates a new table mapping provenance tokens to values of a given
602 * attribute, for use with semiring evaluation functions.
603 *
604 * @param newtbl name of the mapping table to create
605 * @param oldtbl source table with provenance tracking
606 * @param att attribute whose values populate the mapping
607 * @param preserve_case if true, quote the table name to preserve case
608 */
609CREATE OR REPLACE FUNCTION create_provenance_mapping(
610 newtbl TEXT,
611 oldtbl REGCLASS,
612 att TEXT,
613 preserve_case BOOL DEFAULT 'f'
614) RETURNS VOID AS
615$$
616DECLARE
617BEGIN
618 EXECUTE format('CREATE TEMP TABLE tmp_provsql ON COMMIT DROP AS TABLE %s', oldtbl);
619 ALTER TABLE tmp_provsql RENAME provsql TO provenance;
620 IF preserve_case THEN
621 EXECUTE format('CREATE TABLE %I AS SELECT %s AS value, provenance FROM tmp_provsql', newtbl, att);
622 EXECUTE format('CREATE INDEX ON %I(provenance)', newtbl);
623 ELSE
624 EXECUTE format('CREATE TABLE %s AS SELECT %s AS value, provenance FROM tmp_provsql', newtbl, att);
625 EXECUTE format('CREATE INDEX ON %s(provenance)', newtbl);
626 END IF;
627END
628$$ LANGUAGE plpgsql;
629
630/**
631 * @brief Create a view mapping provenance tokens to attribute values
632 *
633 * Like create_provenance_mapping but creates a view instead of a table,
634 * so it always reflects the current state of the source table.
635 *
636 * @param newview name of the view to create
637 * @param oldtbl source table with provenance tracking
638 * @param att attribute whose values populate the mapping
639 * @param preserve_case if true, quote the view name to preserve case
640 */
641CREATE OR REPLACE FUNCTION create_provenance_mapping_view(
642 newview TEXT,
643 oldtbl REGCLASS,
644 att TEXT,
645 preserve_case BOOL DEFAULT false
647RETURNS VOID
648LANGUAGE plpgsql
649AS
650$$
651BEGIN
652 IF preserve_case THEN
653 EXECUTE format(
654 'CREATE OR REPLACE VIEW %I AS SELECT %s AS value, provsql AS provenance FROM %s',
655 newview,
656 att,
657 oldtbl
658 );
659 ELSE
660 EXECUTE format(
661 'CREATE OR REPLACE VIEW %s AS SELECT %s AS value, provsql AS provenance FROM %s',
662 newview,
663 att,
664 oldtbl
665 );
666 END IF;
667END;
668$$;
669
670/** @} */
671
672/** @defgroup internal_constants Internal constants
673 * UUID namespace and identity element functions used for
674 * deterministic gate generation.
675 * @{
676 */
677
678/** @brief Return the ProvSQL UUID namespace (used for deterministic gate UUIDs) */
679CREATE OR REPLACE FUNCTION uuid_ns_provsql() RETURNS UUID AS
680$$
681 -- uuid_generate_v5(uuid_ns_url(),'http://pierre.senellart.com/software/provsql/')
682 SELECT '920d4f02-8718-5319-9532-d4ab83a64489'::UUID
683$$ LANGUAGE SQL IMMUTABLE PARALLEL SAFE;
684
685/** @brief Return the UUID of the semiring zero gate */
686CREATE OR REPLACE FUNCTION gate_zero() RETURNS UUID AS
687$$
688 SELECT public.uuid_generate_v5(provsql.uuid_ns_provsql(),'zero');
689$$ LANGUAGE SQL IMMUTABLE PARALLEL SAFE;
690
691/** @brief Return the UUID of the semiring one gate */
692CREATE OR REPLACE FUNCTION gate_one() RETURNS UUID AS
693$$
694 SELECT public.uuid_generate_v5(provsql.uuid_ns_provsql(),'one');
695$$ LANGUAGE SQL IMMUTABLE PARALLEL SAFE;
696
697/** @brief Return the epsilon threshold used for probability comparisons */
698CREATE OR REPLACE FUNCTION epsilon() RETURNS DOUBLE PRECISION AS
699$$
700 SELECT CAST(0.001 AS DOUBLE PRECISION)
701$$ LANGUAGE SQL IMMUTABLE PARALLEL SAFE;
702
703/** @} */
704
705/** @defgroup semiring_operations Semiring operations
706 * Functions that build provenance circuit gates for semiring operations.
707 * These are called internally by the query rewriter.
708 * @{
709 */
710
711/**
712 * @brief Create a times (product) gate from multiple provenance tokens
713 *
714 * Filters out NULL and one-gates; returns gate_one() if all tokens
715 * are trivial, or a single token if only one remains.
716 */
717CREATE OR REPLACE FUNCTION provenance_times(VARIADIC tokens UUID[])
718 RETURNS UUID AS
719$$
720DECLARE
721 times_token UUID;
722 filtered_tokens UUID[];
723BEGIN
724 SELECT array_agg(t) FROM unnest(tokens) t WHERE t IS NOT NULL AND t <> gate_one() INTO filtered_tokens;
725
726 -- Dispatch on the FILTERED count: a single survivor short-circuits
727 -- to that token directly (no useless single-child times gate); zero
728 -- survivors collapse to the identity. Using array_length(tokens, 1)
729 -- here would miss the [one, cmp] → [cmp] case, leaving the cmp wrapped
730 -- in a one-child times when its only sibling was gate_one().
731 CASE coalesce(array_length(filtered_tokens, 1), 0)
732 WHEN 0 THEN
733 times_token:=gate_one();
734 WHEN 1 THEN
735 times_token:=filtered_tokens[1];
736 ELSE
737 times_token := uuid_generate_v5(uuid_ns_provsql(),concat('times',filtered_tokens));
738
739 PERFORM create_gate(times_token, 'times', ARRAY_AGG(t)) FROM UNNEST(filtered_tokens) AS t WHERE t IS NOT NULL;
740 END CASE;
741
742 RETURN times_token;
743END
744$$ LANGUAGE plpgsql SET search_path=provsql,pg_temp,public SECURITY DEFINER PARALLEL SAFE;
745
746/**
747 * @brief Create a monus (difference) gate from two provenance tokens
748 *
749 * Implements m-semiring monus. Returns token1 if token2 is NULL
750 * (used for LEFT OUTER JOIN semantics in the EXCEPT rewriting).
751 */
752CREATE OR REPLACE FUNCTION provenance_monus(token1 UUID, token2 UUID)
753 RETURNS UUID AS
754$$
755DECLARE
756 monus_token UUID;
757BEGIN
758 IF token1 IS NULL THEN
759 RAISE EXCEPTION USING MESSAGE='provenance_monus is called with first argument NULL';
760 END IF;
761
762 IF token2 IS NULL THEN
763 -- Special semantics, because of a LEFT OUTER JOIN used by the
764 -- difference operator: token2 NULL means there is no second argument
765 RETURN token1;
766 END IF;
767
768 IF token1 = token2 THEN
769 -- X-X=0
770 monus_token:=gate_zero();
771 ELSIF token1 = gate_zero() THEN
772 -- 0-X=0
773 monus_token:=gate_zero();
774 ELSIF token2 = gate_zero() THEN
775 -- X-0=X
776 monus_token:=token1;
777 ELSE
778 monus_token:=uuid_generate_v5(uuid_ns_provsql(),concat('monus',token1,token2));
779 PERFORM create_gate(monus_token, 'monus', ARRAY[token1::UUID, token2::UUID]);
780 END IF;
781
782 RETURN monus_token;
783END
784$$ LANGUAGE plpgsql SET search_path=provsql,pg_temp,public SECURITY DEFINER PARALLEL SAFE IMMUTABLE;
785
786/**
787 * @brief Create a project gate for where-provenance tracking
788 *
789 * Records the mapping between input and output attribute positions.
790 *
791 * @param token child provenance token
792 * @param positions array encoding attribute position mappings
793 */
794CREATE OR REPLACE FUNCTION provenance_project(token UUID, VARIADIC positions INT[])
795 RETURNS UUID AS
796$$
797DECLARE
798 project_token UUID;
799 rec RECORD;
800BEGIN
801 project_token:=uuid_generate_v5(uuid_ns_provsql(),concat('project', token, positions));
802 PERFORM create_gate(project_token, 'project', ARRAY[token]);
803 PERFORM set_extra(project_token, ARRAY_AGG(pair)::TEXT)
804 FROM (
805 SELECT ARRAY[(CASE WHEN info=0 THEN NULL ELSE info END), idx] AS pair
806 FROM unnest(positions) WITH ORDINALITY AS a(info, idx)
807 ORDER BY idx
808 ) t;
809
810 RETURN project_token;
811END
812$$ LANGUAGE plpgsql SET search_path=provsql,pg_temp,public SECURITY DEFINER PARALLEL SAFE IMMUTABLE;
813
814/**
815 * @brief Create an equijoin gate for where-provenance tracking
816 *
817 * @param token child provenance token
818 * @param pos1 attribute index in the first relation
819 * @param pos2 attribute index in the second relation
820 */
821CREATE OR REPLACE FUNCTION provenance_eq(token UUID, pos1 INT, pos2 INT)
822 RETURNS UUID AS
823$$
824DECLARE
825 eq_token UUID;
826 rec RECORD;
827BEGIN
828 eq_token:=uuid_generate_v5(uuid_ns_provsql(),concat('eq',token,pos1,',',pos2));
829
830 PERFORM create_gate(eq_token, 'eq', ARRAY[token::UUID]);
831 PERFORM set_infos(eq_token, pos1, pos2);
832 RETURN eq_token;
833END
834$$ LANGUAGE plpgsql SET search_path=provsql,pg_temp,public SECURITY DEFINER PARALLEL SAFE IMMUTABLE;
835
836/**
837 * @brief Create a plus (sum) gate from an array of provenance tokens
838 *
839 * Filters out NULL and zero-gates; returns gate_zero() if all tokens
840 * are trivial, or a single token if only one remains.
841 */
842CREATE OR REPLACE FUNCTION provenance_plus(tokens UUID[])
843 RETURNS UUID AS
844$$
845DECLARE
846 c INTEGER;
847 plus_token UUID;
848 filtered_tokens UUID[];
849BEGIN
850 SELECT array_agg(t) FROM unnest(tokens) t
851 WHERE t IS NOT NULL AND t <> gate_zero()
852 INTO filtered_tokens;
853
854 c:=array_length(filtered_tokens, 1);
855
856 IF c = 0 THEN
857 plus_token := gate_zero();
858 ELSIF c = 1 THEN
859 plus_token := filtered_tokens[1];
860 ELSE
861 plus_token := uuid_generate_v5(
862 uuid_ns_provsql(),
863 concat('plus', filtered_tokens));
865 PERFORM create_gate(plus_token, 'plus', filtered_tokens);
866 END IF;
867
868 RETURN plus_token;
869END
870$$ LANGUAGE plpgsql STRICT SET search_path=provsql,pg_temp,public SECURITY DEFINER PARALLEL SAFE IMMUTABLE;
871
872/**
873 * @brief Create a comparison gate for HAVING clause provenance
874 *
875 * @param left_token provenance token for the left operand
876 * @param comparison_op OID of the comparison operator
877 * @param right_token provenance token for the right operand
878 */
879CREATE OR REPLACE FUNCTION provenance_cmp(
880 left_token UUID,
881 comparison_op OID,
882 right_token UUID
883)
884RETURNS UUID AS
885$$
886DECLARE
887 cmp_token UUID;
888BEGIN
889 -- deterministic v5 namespace id
890 cmp_token := public.uuid_generate_v5(
891 uuid_ns_provsql(),
892 concat('cmp', left_token::TEXT, comparison_op::TEXT, right_token::TEXT)
893 );
894 -- wire it up in the circuit
895 PERFORM create_gate(cmp_token, 'cmp', ARRAY[left_token, right_token]);
896 PERFORM set_infos(cmp_token, comparison_op::INTEGER);
897 RETURN cmp_token;
898END
899$$ LANGUAGE plpgsql
900 SET search_path=provsql,pg_temp,public
901 SECURITY DEFINER
902 IMMUTABLE
903 PARALLEL SAFE
904 STRICT;
905
906/**
907 * @brief Create an arithmetic gate over scalar-valued provenance children
908 *
909 * Builds a deterministic @c gate_arith from an operator tag and an
910 * ordered list of children. The tag is one of the @c provsql_arith_op
911 * ENUM values declared in @c src/provsql_utils.h
912 * (@c PLUS=0, @c TIMES=1, @c MINUS=2, @c DIV=3, @c NEG=4) and is
913 * stored in the gate's @c info1 field. Children must be UUIDs of
914 * scalar-producing gates (@c gate_rv, @c gate_value, or another
915 * @c gate_arith). The token UUID is derived deterministically from
916 * @p op and @p children so identical sub-expressions share their gate.
918 * @param op Operator tag (@c provsql_arith_op).
919 * @param children Ordered list of child gate UUIDs.
920 * @return UUID of the (possibly pre-existing) @c gate_arith.
921 */
922CREATE OR REPLACE FUNCTION provenance_arith(
923 op INTEGER,
924 children UUID[]
925)
926RETURNS UUID AS
927$$
928DECLARE
929 arith_token UUID;
930BEGIN
931 arith_token := public.uuid_generate_v5(
932 uuid_ns_provsql(),
933 concat('arith', op::TEXT, children::TEXT)
934 );
935 PERFORM create_gate(arith_token, 'arith', children);
936 PERFORM set_infos(arith_token, op);
937 RETURN arith_token;
938END
939$$ LANGUAGE plpgsql
940 SET search_path=provsql,pg_temp,public
941 SECURITY DEFINER
942 IMMUTABLE
943 PARALLEL SAFE
944 STRICT;
945
946/** @} */
947
948/** @defgroup semiring_evaluation Semiring evaluation
949 * Functions for evaluating provenance circuits over semirings,
950 * both user-defined (via function references) and compiled (built-in).
951 * @{
952 */
953
954/**
955 * @brief Evaluate provenance using a compiled (built-in) semiring
956 *
957 * This C function handles semiring evaluation entirely in C++ for
958 * better performance. The semiring is specified by name.
960 * @param token provenance token to evaluate
961 * @param token2value mapping table from tokens to semiring values
962 * @param semiring name of the compiled semiring (e.g., "formula", "counting")
963 * @param element_one identity element of the semiring
964 */
965CREATE OR REPLACE FUNCTION provenance_evaluate_compiled(
966 token UUID,
967 token2value REGCLASS,
968 semiring TEXT,
969 element_one ANYELEMENT)
970RETURNS ANYELEMENT AS
971 'provsql', 'provenance_evaluate_compiled' LANGUAGE C PARALLEL SAFE STABLE;
972
973
974/**
975 * @brief Evaluate provenance over a user-defined semiring (PL/pgSQL version)
976 *
977 * Recursively walks the provenance circuit and evaluates each gate
978 * using the provided semiring operations. This is the generic version
979 * that accepts semiring operations as function references.
980 *
981 * @param token provenance token to evaluate
982 * @param token2value mapping table from tokens to semiring values
983 * @param element_one identity element of the semiring
984 * @param value_type OID of the semiring value type
985 * @param plus_function semiring addition (aggregate)
986 * @param times_function semiring multiplication (aggregate)
987 * @param monus_function semiring monus (binary), or NULL
988 * @param delta_function δ-semiring operator, or NULL
989 */
990CREATE OR REPLACE FUNCTION provenance_evaluate(
991 token UUID,
992 token2value REGCLASS,
993 element_one ANYELEMENT,
994 value_type REGTYPE,
995 plus_function REGPROC,
996 times_function REGPROC,
997 monus_function REGPROC,
998 delta_function REGPROC)
999 RETURNS ANYELEMENT AS
1000$$
1001DECLARE
1002 gate_type PROVENANCE_GATE;
1003 result ALIAS FOR $0;
1004 children UUID[];
1005-- cmp_value ANYELEMENT;
1006-- temp_result ANYELEMENT;
1007 value_text TEXT;
1008BEGIN
1009 SELECT get_gate_type(token) INTO gate_type;
1010
1011 IF gate_type IS NULL THEN
1012 RETURN NULL;
1013
1014 ELSIF gate_type = 'input' THEN
1015 EXECUTE format('SELECT value FROM %s WHERE provenance=%L', token2value, token)
1016 INTO result;
1017 IF result IS NULL THEN
1018 result := element_one;
1019 END IF;
1020 ELSIF gate_type = 'mulinput' THEN
1021 SELECT concat('{',(get_children(token))[1]::TEXT,'=',(get_infos(token)).info1,'}')
1022 INTO result;
1023 ELSIF gate_type='update' THEN
1024 EXECUTE format('SELECT value FROM %s WHERE provenance=%L',token2value,token) INTO result;
1025 IF result IS NULL THEN
1026 result:=element_one;
1027 END IF;
1028 ELSIF gate_type = 'plus' THEN
1029 EXECUTE format('SELECT %s(provsql.provenance_evaluate(t,%L,%L::%s,%L,%L,%L,%L,%L)) FROM unnest(get_children(%L)) AS t',
1030 plus_function, token2value, element_one, value_type, value_type, plus_function, times_function, monus_function, delta_function, token)
1031 INTO result;
1032
1033 ELSIF gate_type = 'times' THEN
1034 EXECUTE format('SELECT %s(provsql.provenance_evaluate(t,%L,%L::%s,%L,%L,%L,%L,%L)) FROM unnest(get_children(%L)) AS t',
1035 times_function, token2value, element_one, value_type, value_type, plus_function, times_function, monus_function, delta_function, token)
1036 INTO result;
1037
1038 ELSIF gate_type = 'monus' THEN
1039 IF monus_function IS NULL THEN
1040 RAISE EXCEPTION USING MESSAGE='Provenance with negation evaluated over a semiring without monus function';
1041 ELSE
1042 EXECUTE format('SELECT %s(a1,a2) FROM (SELECT provsql.provenance_evaluate(c[1],%L,%L::%s,%L,%L,%L,%L,%L) AS a1, ' ||
1043 'provsql.provenance_evaluate(c[2],%L,%L::%s,%L,%L,%L,%L,%L) AS a2 FROM get_children(%L) c) tmp',
1044 monus_function, token2value, element_one, value_type, value_type, plus_function, times_function, monus_function, delta_function,
1045 token2value, element_one, value_type, value_type, plus_function, times_function, monus_function, delta_function, token)
1046 INTO result;
1047 END IF;
1048
1049 ELSIF gate_type = 'eq' THEN
1050 EXECUTE format('SELECT provsql.provenance_evaluate((get_children(%L))[1],%L,%L::%s,%L,%L,%L,%L,%L)',
1051 token, token2value, element_one, value_type, value_type, plus_function, times_function, monus_function, delta_function)
1052 INTO result;
1053
1054/* elsif gate_type = 'cmp' then
1055
1056 EXECUTE format('SELECT provsql.provenance_evaluate((get_children(%L))[1],%L,%L::%s,%L,%L,%L,%L,%L)',
1057 token, token2value, element_one, value_type, value_type, plus_function, times_function, monus_function, delta_function)
1058 INTO temp_result;
1060 EXECUTE format('SELECT get_extra((get_children(%L))[2])', token)
1061 INTO cmp_value;
1062
1063 IF temp_result::TEXT = cmp_value::TEXT THEN
1064 SELECT concat('{',temp_result::TEXT,'=',cmp_value::TEXT,'}')
1065 INTO result;
1066 ELSE
1067 RETURN gate_zero()
1068 */
1069
1071
1072 ELSIF gate_type = 'delta' THEN
1073 IF delta_function IS NULL THEN
1074 RAISE EXCEPTION USING MESSAGE='Provenance with aggregation evaluated over a semiring without delta function';
1075 ELSE
1076 EXECUTE format('SELECT %I(a) FROM (SELECT provsql.provenance_evaluate((get_children(%L))[1],%L,%L::%s,%L,%L,%L,%L,%L) AS a) tmp',
1077 delta_function, token, token2value, element_one, value_type, value_type, plus_function, times_function, monus_function, delta_function)
1078 INTO result;
1079 END IF;
1080
1081 ELSIF gate_type = 'zero' THEN
1082 EXECUTE format('SELECT %I(a) FROM (SELECT %L::%I AS a WHERE FALSE) temp', plus_function, element_one, value_type)
1083 INTO result;
1084
1085 ELSIF gate_type = 'one' THEN
1086 EXECUTE format('SELECT %L::%I', element_one, value_type)
1087 INTO result;
1088
1089 ELSIF gate_type = 'project' THEN
1090 EXECUTE format('SELECT provsql.provenance_evaluate((get_children(%L))[1],%L,%L::%s,%L,%L,%L,%L,%L)',
1091 token, token2value, element_one, value_type, value_type, plus_function, times_function, monus_function, delta_function)
1092 INTO result;
1093
1094 ELSE
1095 RAISE EXCEPTION USING MESSAGE='provenance_evaluate cannot be called on formulas using ' || gate_type || ' gates; use compiled semirings instead';
1096 END IF;
1097
1098 RETURN result;
1099END
1100$$ LANGUAGE plpgsql PARALLEL SAFE STABLE;
1101
1102
1104 * @brief Evaluate aggregate provenance over a user-defined semiring (PL/pgSQL version)
1105 *
1106 * Handles agg and semimod gates produced by GROUP BY queries.
1107 *
1108 * @param token provenance token to evaluate
1109 * @param token2value mapping table from tokens to semiring values
1110 * @param agg_function_final finalization function for the aggregate
1111 * @param agg_function aggregate combination function
1112 * @param semimod_function semimodule scalar multiplication function
1113 * @param element_one identity element of the semiring
1114 * @param value_type OID of the semiring value type
1115 * @param plus_function semiring addition
1116 * @param times_function semiring multiplication
1117 * @param monus_function semiring monus, or NULL
1118 * @param delta_function δ-semiring operator, or NULL
1119 */
1120CREATE OR REPLACE FUNCTION aggregation_evaluate(
1121 token UUID,
1122 token2value REGCLASS,
1123 agg_function_final REGPROC,
1124 agg_function REGPROC,
1125 semimod_function REGPROC,
1126 element_one ANYELEMENT,
1127 value_type REGTYPE,
1128 plus_function REGPROC,
1129 times_function REGPROC,
1130 monus_function REGPROC,
1131 delta_function REGPROC)
1132 RETURNS ANYELEMENT AS
1133$$
1134DECLARE
1135 gt PROVENANCE_GATE;
1136 result ALIAS FOR $0;
1137BEGIN
1138 SELECT get_gate_type(token) INTO gt;
1139
1140 IF gt IS NULL THEN
1141 RETURN NULL;
1142 ELSIF gt='agg' THEN
1143 EXECUTE format('SELECT %I(%I(provsql.aggregation_evaluate(t,%L,%L,%L,%L,%L::%s,%L,%L,%L,%L,%L)),pp.proname::varchar) FROM
1144 unnest(get_children(%L)) AS t, pg_proc pp
1145 WHERE pp.oid=(get_infos(%L)).info1
1146 GROUP BY pp.proname',
1147 agg_function_final, agg_function,token2value,agg_function_final,agg_function,semimod_function,element_one,value_type,value_type,plus_function,times_function,
1148 monus_function,delta_function,token,token)
1149 INTO result;
1150 ELSE
1151 -- gt='semimod'
1152 EXECUTE format('SELECT %I(get_extra((get_children(%L))[2]),provsql.provenance_evaluate((get_children(%L))[1],%L,%L::%s,%L,%L,%L,%L,%L))',
1153 semimod_function,token,token,token2value,element_one,value_type,value_type,plus_function,times_function,monus_function,delta_function)
1154 INTO result;
1155 END IF;
1156 RETURN result;
1157END
1158$$ LANGUAGE plpgsql PARALLEL SAFE STABLE;
1159
1160/**
1161 * @brief Evaluate provenance over a user-defined semiring (C version)
1163 * Optimized C implementation of provenance_evaluate. Infers the
1164 * value type from element_one. Monus and delta functions are optional.
1165 *
1166 * @param token provenance token to evaluate
1167 * @param token2value mapping table from tokens to semiring values
1168 * @param element_one identity element of the semiring
1169 * @param plus_function semiring addition (aggregate)
1170 * @param times_function semiring multiplication (aggregate)
1171 * @param monus_function semiring monus, or NULL if not needed
1172 * @param delta_function δ-semiring operator, or NULL if not needed
1173 */
1174CREATE OR REPLACE FUNCTION provenance_evaluate(
1175 token UUID,
1176 token2value REGCLASS,
1177 element_one ANYELEMENT,
1178 plus_function REGPROC,
1179 times_function REGPROC,
1180 monus_function REGPROC = NULL,
1181 delta_function REGPROC = NULL)
1182 RETURNS ANYELEMENT AS
1183 'provsql','provenance_evaluate' LANGUAGE C STABLE;
1184
1185/** @brief Evaluate aggregate provenance over a user-defined semiring (C version) */
1186CREATE OR REPLACE FUNCTION aggregation_evaluate(
1187 token UUID,
1188 token2value REGCLASS,
1189 agg_function_final REGPROC,
1190 agg_function REGPROC,
1191 semimod_function REGPROC,
1192 element_one ANYELEMENT,
1193 plus_function REGPROC,
1194 times_function REGPROC,
1195 monus_function REGPROC = NULL,
1196 delta_function REGPROC = NULL)
1197 RETURNS ANYELEMENT AS
1198 'provsql','aggregation_evaluate' LANGUAGE C STABLE;
1199
1200/** @} */
1201
1202/** @defgroup circuit_introspection Circuit introspection
1203 * Functions for examining the structure of provenance circuits,
1204 * used by visualization and where-provenance features.
1205 * @{
1206 */
1207
1208/** @brief Row type for sub_circuit_with_desc results */
1209CREATE TYPE GATE_WITH_DESC AS (f UUID, t UUID, gate_type PROVENANCE_GATE, desc_str CHARACTER VARYING, infos INTEGER[], extra TEXT);
1210
1211/**
1212 * @brief Return the sub-circuit reachable from a token, with descriptions
1213 *
1214 * Recursively traverses the provenance circuit from the given token and
1215 * returns all edges together with input gate descriptions from the
1216 * mapping table.
1218 * @param token root provenance token
1219 * @param token2desc mapping table providing descriptions for input gates
1220 */
1221CREATE OR REPLACE FUNCTION sub_circuit_with_desc(
1222 token UUID,
1223 token2desc REGCLASS) RETURNS SETOF GATE_WITH_DESC AS
1224$$
1225BEGIN
1226 RETURN QUERY EXECUTE
1227 'WITH RECURSIVE transitive_closure(f,t,gate_type) AS (
1228 SELECT $1,t,provsql.get_gate_type($1) FROM unnest(provsql.get_children($1)) AS t
1229 UNION ALL
1230 SELECT p1.t,u,provsql.get_gate_type(p1.t) FROM transitive_closure p1, unnest(provsql.get_children(p1.t)) AS u)
1231 SELECT *, ARRAY[(get_infos(f)).info1, (get_infos(f)).info2], get_extra(f) FROM (
1232 SELECT f::UUID,t::UUID,gate_type,NULL FROM transitive_closure
1233 UNION ALL
1234 SELECT p2.provenance::UUID as f, NULL::UUID, ''input'', CAST (p2.value AS varchar) FROM transitive_closure p1 JOIN ' || token2desc || ' AS p2
1235 ON p2.provenance=t
1236 UNION ALL
1237 SELECT provenance::UUID as f, NULL::UUID, ''input'', CAST (value AS varchar) FROM ' || token2desc || ' WHERE provenance=$1
1238 ) t'
1239 USING token LOOP;
1240 RETURN;
1241END
1242$$ LANGUAGE plpgsql PARALLEL SAFE;
1243
1244/**
1245 * @brief Identify which table and how many columns a provenance token belongs to
1246 *
1247 * Searches all provenance-tracked tables for a row matching the given
1248 * token and returns the table name and column count.
1249 *
1250 * @param token provenance token to look up
1251 * @param table_name (OUT) the table containing this token
1252 * @param nb_columns (OUT) number of non-provenance columns in that table
1253 */
1254CREATE OR REPLACE FUNCTION identify_token(
1255 token UUID, OUT table_name REGCLASS, OUT nb_columns INTEGER) AS
1256$$
1257DECLARE
1258 t RECORD;
1259 result RECORD;
1260BEGIN
1261 table_name:=NULL;
1262 nb_columns:=-1;
1263 FOR t IN
1264 SELECT relname,
1265 (SELECT count(*) FROM pg_attribute a2 WHERE a2.attrelid=a1.attrelid AND attnum>0 AND atttypid<>0)-1 c
1266 FROM pg_attribute a1 JOIN pg_type ON atttypid=pg_type.oid
1267 JOIN pg_class ON attrelid=pg_class.oid
1268 JOIN pg_namespace ON relnamespace=pg_namespace.oid
1269 WHERE typname='UUID' AND relkind='r'
1270 AND nspname<>'provsql'
1271 AND attname='provsql'
1272 LOOP
1273 EXECUTE format('SELECT * FROM %I WHERE provsql=%L',t.relname,token) INTO result;
1274 IF result IS NOT NULL THEN
1275 table_name:=t.relname;
1276 nb_columns:=t.c;
1277 EXIT;
1278 END IF;
1279 END LOOP;
1280END
1281$$ LANGUAGE plpgsql STRICT;
1282
1283/**
1284 * @brief Return the sub-circuit for where-provenance computation
1285 *
1286 * Similar to sub_circuit_with_desc but resolves input gates to their
1287 * source table and column count for where-provenance evaluation.
1288 */
1289CREATE OR REPLACE FUNCTION sub_circuit_for_where(token UUID)
1290 RETURNS TABLE(f UUID, t UUID, gate_type PROVENANCE_GATE, table_name REGCLASS, nb_columns INTEGER, infos INTEGER[], extra TEXT) AS
1291$$
1292 WITH RECURSIVE transitive_closure(f,t,idx,gate_type) AS (
1293 SELECT $1,t,id,provsql.get_gate_type($1) FROM unnest(provsql.get_children($1)) WITH ORDINALITY AS a(t,id)
1294 UNION ALL
1295 SELECT p1.t,u,id,provsql.get_gate_type(p1.t) FROM transitive_closure p1, unnest(provsql.get_children(p1.t)) WITH ORDINALITY AS a(u, id)
1296 ) SELECT f, t, gate_type, table_name, nb_columns, ARRAY[(get_infos(f)).info1, (get_infos(f)).info2], get_extra(f) FROM (
1297 SELECT f, t::UUID, idx, gate_type, NULL AS table_name, NULL AS nb_columns FROM transitive_closure
1298 UNION ALL
1299 SELECT DISTINCT t, NULL::UUID, NULL::INT, 'input'::PROVENANCE_GATE, (id).table_name, (id).nb_columns FROM transitive_closure JOIN (SELECT t AS prov, provsql.identify_token(t) as id FROM transitive_closure WHERE t NOT IN (SELECT f FROM transitive_closure)) temp ON t=prov
1300 UNION ALL
1301 SELECT DISTINCT $1, NULL::UUID, NULL::INT, 'input'::PROVENANCE_GATE, (id).table_name, (id).nb_columns FROM (SELECT provsql.identify_token($1) AS id WHERE $1 NOT IN (SELECT f FROM transitive_closure)) temp
1302 ) t
1303$$
1304LANGUAGE sql;
1305
1306/**
1307 * @brief BFS expansion of a provenance circuit, capped at @p max_depth
1308 *
1309 * Returns one row per (parent, child) edge in the BFS-bounded subgraph
1310 * rooted at @p root, plus one row for the root with <tt>parent</tt> and
1311 * <tt>child_pos</tt> NULL. Provenance circuits are DAGs, so a child gate
1312 * may have several parents within the bound; each such edge is reported
1313 * as a separate row, so callers must deduplicate on <tt>node</tt> if they
1314 * need a one-row-per-node view.
1315 *
1316 * <tt>depth</tt> is the node's BFS depth (its shortest distance from
1317 * @p root), so for an edge (parent, child) it is always the case that
1318 * <tt>parent.depth + 1 &gt;= child.depth</tt>; equality holds only on
1319 * shortest-path edges. A node at <tt>depth = max_depth</tt> is not
1320 * expanded; callers can detect a partial expansion by comparing
1321 * <tt>provsql.get_children</tt> length against the number of outgoing
1322 * edges reported.
1323 *
1324 * <tt>info1</tt> and <tt>info2</tt> are the INTEGER values stored on
1325 * the gate by <tt>provsql.set_infos</tt>, formatted as TEXT; their
1326 * meaning is gate-type-specific (see <tt>provsql.set_infos</tt>).
1327 *
1328 * @param root root provenance token
1329 * @param max_depth maximum BFS depth (default 8)
1330 */
1331CREATE OR REPLACE FUNCTION circuit_subgraph(root UUID, max_depth INT DEFAULT 8)
1332 RETURNS TABLE(node UUID, parent UUID, child_pos INT, gate_type TEXT, info1 TEXT, info2 TEXT, depth INT) AS
1333$$
1334 WITH RECURSIVE bfs(node, parent, child_pos, depth) AS (
1335 SELECT root, NULL::UUID, NULL::INT, 0
1336 UNION ALL
1337 SELECT c.t, b.node, c.idx::INT, b.depth + 1
1338 FROM bfs b
1339 CROSS JOIN LATERAL unnest(provsql.get_children(b.node))
1340 WITH ORDINALITY AS c(t, idx)
1341 WHERE b.depth < max_depth
1342 ),
1343 -- Each node's canonical depth is its shortest-path distance from the root.
1344 -- Tie-breaking on child_pos is irrelevant for the depth value but kept so
1345 -- the (now informational) row order is stable.
1346 node_depth AS (
1347 SELECT node, MIN(depth) AS depth FROM bfs GROUP BY node
1348 ),
1349 -- All distinct (parent, child, child_pos) triples seen during the BFS.
1350 -- A child reached from k parents within the bound contributes k rows.
1351 -- Self-joins (times(x, x)) contribute one row per child position.
1352 edges AS (
1353 SELECT DISTINCT parent, node AS child, child_pos
1354 FROM bfs WHERE parent IS NOT NULL
1355 )
1356 SELECT
1357 d.node,
1358 e.parent,
1359 e.child_pos,
1360 provsql.get_gate_type(d.node)::TEXT,
1361 i.info1::TEXT,
1362 i.info2::TEXT,
1363 d.depth
1364 FROM node_depth d
1365 LEFT JOIN edges e ON e.child = d.node
1366 LEFT JOIN LATERAL provsql.get_infos(d.node) i ON TRUE
1367 ORDER BY d.depth, d.node, e.parent;
1368$$ LANGUAGE sql STABLE PARALLEL SAFE;
1369
1370/**
1371 * @brief BFS subgraph of the IN-MEMORY simplified circuit rooted at @p root.
1372 *
1373 * Same row shape as @ref circuit_subgraph plus an inline @c extra
1374 * column, but built from the @c GenericCircuit returned by
1375 * @c getGenericCircuit -- i.e. AFTER @c provsql.simplify_on_load
1376 * passes (RangeCheck, ...) have rewritten any decidable @c gate_cmp
1377 * into Bernoulli @c gate_input / @c gate_zero / @c gate_one leaves.
1378 * Lets a renderer show the user what the evaluator actually sees,
1379 * without mutating the persisted DAG.
1380 *
1381 * Returns @c jsonb (an array of objects) rather than @c SETOF RECORD
1382 * to keep the C++ implementation free of SRF / @c FuncCallContext
1383 * boilerplate; callers either consume the array directly or expand
1384 * it via @c jsonb_array_elements.
1385 *
1386 * @param root Root provenance token.
1387 * @param max_depth Maximum BFS depth (default 8).
1388 */
1389CREATE OR REPLACE FUNCTION simplified_circuit_subgraph(
1390 root UUID, max_depth INT DEFAULT 8) RETURNS jsonb
1391 AS 'provsql','simplified_circuit_subgraph'
1392 LANGUAGE C STABLE PARALLEL SAFE;
1393
1394/**
1395 * @brief Empirical histogram of a scalar sub-circuit
1396 *
1397 * Returns a jsonb array of @c {bin_lo, bin_hi, count} objects covering
1398 * the observed @c [min, max] range of @p bins equal-width samples from
1399 * the sub-circuit rooted at @p token. Sample count is taken from
1400 * @c provsql.rv_mc_samples; pinning @c provsql.monte_carlo_seed makes
1401 * the result reproducible.
1402 *
1403 * Accepted root gate types are the scalar ones: @c gate_value (Dirac
1404 * at the constant, single bin), @c gate_rv (sampled from the leaf's
1405 * distribution), and @c gate_arith (sampled by recursing through the
1406 * arithmetic DAG, with shared @c gate_rv leaves correctly correlated
1407 * within an iteration). Any other gate type raises.
1408 *
1409 * @param token Root provenance token of a scalar sub-circuit.
1410 * @param bins Number of equal-width histogram bins (default 30).
1411 * @param prov Conditioning event (defaults to @c gate_one() = no
1412 * conditioning). When non-trivial, the histogram is
1413 * over the conditional distribution recovered by
1414 * rejection sampling on the joint circuit with @p token.
1415 */
1416CREATE OR REPLACE FUNCTION rv_histogram(
1417 token UUID, bins INT DEFAULT 30, prov UUID DEFAULT gate_one())
1418 RETURNS jsonb
1419 AS 'provsql','rv_histogram'
1420 LANGUAGE C VOLATILE PARALLEL SAFE;
1421
1422/**
1423 * @brief Sample the closed-form PDF and CDF of a (possibly truncated)
1424 * scalar distribution.
1425 *
1426 * Returns @c {"pdf": [{x, p}, ...], "cdf": [{x, p}, ...]} with @p samples
1427 * evenly-spaced points spanning the distribution's natural display
1428 * range (intersected with the conditioning event's interval when
1429 * @c prov is non-trivial). Used by ProvSQL Studio's Distribution
1430 * profile panel to overlay the analytical curve on the empirical
1431 * histogram from :sqlfunc:`rv_histogram` -- the simplifier's
1432 * analytical wins (e.g. @c c·Exp(λ) folding to @c Exp(λ/c)) become
1433 * visible as a smooth curve riding over the MC-sampled bars.
1434 *
1435 * Returns @c NULL when the root sub-circuit is not a closed-form
1436 * shape (V1: only bare @c gate_rv of Normal / Uniform / Exponential
1437 * / INTEGER-Erlang). The frontend reads @c NULL as "skip overlay"
1438 * without erroring, so the caller can dispatch this in parallel with
1439 * @c rv_histogram regardless of the underlying shape.
1440 *
1441 * @param token Scalar gate token (random_variable's UUID).
1442 * @param samples Number of (x, p) points; must be >= 2.
1443 * @param prov Conditioning event (defaults to @c gate_one() = no
1444 * conditioning). When non-trivial, the curves are
1445 * over the truncated distribution.
1446 */
1447CREATE OR REPLACE FUNCTION rv_analytical_curves(
1448 token UUID, samples INT DEFAULT 100, prov UUID DEFAULT gate_one())
1449 RETURNS jsonb
1450 AS 'provsql','rv_analytical_curves'
1451 LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
1452
1453/**
1454 * @brief Draw conditional Monte Carlo samples from a scalar gate.
1455 *
1456 * Returns up to @c n samples of the scalar value at @c token; when
1457 * @c prov is not the trivial @c gate_one() event, draws are accepted
1458 * only on iterations where @c prov evaluates true (rejection
1459 * sampling). Shared @c gate_rv leaves between @c token and @c prov
1460 * are loaded into a single joint circuit so the indicator's draw
1461 * and the value's draw share their per-iteration state.
1462 *
1463 * @param token Scalar sub-circuit root.
1464 * @param n Number of accepted samples to attempt.
1465 * @param prov Conditioning event (defaults to @c gate_one() = no
1466 * conditioning).
1467 *
1468 * Emits a @c NOTICE when the conditional acceptance rate yields fewer
1469 * than @c n samples within the @c provsql.rv_mc_samples budget so the
1470 * caller can choose to widen the budget.
1471 */
1472CREATE OR REPLACE FUNCTION rv_sample(
1473 token UUID, n INTEGER, prov UUID DEFAULT gate_one())
1474 RETURNS SETOF float8
1475 AS 'provsql','rv_sample'
1476 LANGUAGE C VOLATILE PARALLEL SAFE;
1477
1478/**
1479 * @brief Resolve an input gate UUID back to its source row
1480 *
1481 * Searches every provenance-tracked relation for a row whose
1482 * <tt>provsql</tt> column equals @p UUID and returns the relation's
1483 * REGCLASS together with the row encoded as JSONB. Returns zero
1484 * rows when @p UUID is not the provenance token of any tracked row,
1485 * including when it identifies an internal gate (<tt>plus</tt>,
1486 * <tt>times</tt>, ...) rather than an input.
1487 *
1488 * Ordinarily exactly one row is returned, but if the same UUID
1489 * happens to appear as a <tt>provsql</tt> value in several tracked
1490 * tables, all matches are returned.
1491 *
1492 * @param UUID token to resolve
1493 */
1494CREATE OR REPLACE FUNCTION resolve_input(UUID UUID)
1495 RETURNS TABLE(relation REGCLASS, row_data JSONB) AS
1496$$
1497DECLARE
1498 t RECORD;
1499 rel REGCLASS;
1500 rd JSONB;
1501 -- ProvSQL's rewriter unconditionally appends a provsql column to the
1502 -- targetlist of any SELECT reading from a tracked relation; capture and
1503 -- discard it here rather than disabling the rewriter for the whole call.
1504 ign UUID;
1505BEGIN
1506 FOR t IN
1507 SELECT c.oid::REGCLASS AS regc
1508 FROM pg_attribute a
1509 JOIN pg_class c ON a.attrelid = c.oid
1510 JOIN pg_namespace ns ON c.relnamespace = ns.oid
1511 JOIN pg_type ty ON a.atttypid = ty.oid
1512 WHERE a.attname = 'provsql'
1513 AND ty.typname = 'UUID'
1514 AND c.relkind = 'r'
1515 AND ns.nspname <> 'provsql'
1516 AND a.attnum > 0
1517 LOOP
1518 FOR rel, rd, ign IN
1519 EXECUTE format(
1520 'SELECT %L::REGCLASS, to_jsonb(t) - ''provsql'', t.provsql FROM %s AS t WHERE provsql = $1',
1521 t.regc, t.regc)
1522 USING UUID
1523 LOOP
1524 relation := rel;
1525 row_data := rd;
1526 RETURN NEXT;
1527 END LOOP;
1528 END LOOP;
1529END
1530$$ LANGUAGE plpgsql STABLE;
1532/** @} */
1533
1534/** @defgroup agg_token_type Type for the result of aggregate queries
1535 *
1536 * Custom type <tt>AGG_TOKEN</tt> for a provenance semimodule value, to
1537 * be used in attributes that are computed as a result of aggregation.
1538 * As for provenance tokens, this is simply a UUID, but this UUID is
1539 * displayed in a specific way (as the result of the aggregation
1540 * followed by a "(*)") to help with readability.
1541 *
1542 * The TEXT output is controlled by the
1543 * <tt>provsql.aggtoken_text_as_uuid</tt> GUC. By default it is off and
1544 * the cell renders as <tt>"value (*)"</tt>. When set to on (typical
1545 * for UI layers such as ProvSQL Studio), the cell renders as the
1546 * underlying UUID instead, so the caller can click through to the
1547 * provenance circuit; the value side is then recovered via
1548 * <tt>provsql.agg_token_value_text(UUID)</tt>.
1549 *
1550 * @{
1551 */
1552
1553CREATE TYPE AGG_TOKEN;
1554
1555/** @brief Input function for the AGG_TOKEN type (parses TEXT representation) */
1556CREATE OR REPLACE FUNCTION agg_token_in(CSTRING)
1557 RETURNS AGG_TOKEN
1558 AS 'provsql','agg_token_in' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
1559
1560/**
1561 * @brief Output function for the AGG_TOKEN type
1562 *
1563 * Default: produces the human-friendly @c "value (*)" form, where
1564 * @c value is the running aggregate state.
1565 *
1566 * When the @c provsql.aggtoken_text_as_uuid GUC is on, returns the
1567 * underlying provenance UUID instead. UI layers (notably ProvSQL
1568 * Studio) flip this on per session so aggregate cells expose the
1569 * circuit root UUID for click-through; the @c "value (*)" display
1570 * string is recovered via @c provsql.agg_token_value_text(UUID).
1571 *
1572 * Marked STABLE rather than IMMUTABLE because the chosen output
1573 * shape now depends on a GUC that the same session can flip at
1574 * runtime.
1575 */
1576CREATE OR REPLACE FUNCTION agg_token_out(AGG_TOKEN)
1577 RETURNS CSTRING
1578 AS 'provsql','agg_token_out' LANGUAGE C STABLE STRICT PARALLEL SAFE;
1579
1580/** @brief Cast an AGG_TOKEN to its TEXT representation */
1581CREATE OR REPLACE FUNCTION agg_token_cast(AGG_TOKEN)
1582 RETURNS TEXT
1583 AS 'provsql','agg_token_cast' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
1584
1585CREATE TYPE AGG_TOKEN (
1586 internallength = 117,
1587 input = agg_token_in,
1588 output = agg_token_out,
1589 alignment = char
1591
1592/** @brief Extract the UUID from an AGG_TOKEN (implicit cast to UUID) */
1593CREATE OR REPLACE FUNCTION agg_token_uuid(aggtok AGG_TOKEN)
1594 RETURNS UUID AS
1595$$
1596BEGIN
1597 RETURN agg_token_cast(aggtok)::UUID;
1599$$ LANGUAGE plpgsql STRICT SET search_path=provsql,pg_temp,public SECURITY DEFINER IMMUTABLE PARALLEL SAFE;
1600
1601/** @brief Implicit PostgreSQL cast from AGG_TOKEN to UUID (delegates to agg_token_uuid()) */
1602CREATE CAST (AGG_TOKEN AS UUID) WITH FUNCTION agg_token_uuid(AGG_TOKEN) AS IMPLICIT;
1603
1604/**
1605 * @brief Recover the @c "value (*)" display string for an aggregation gate
1606 *
1607 * Companion helper to the @c provsql.aggtoken_text_as_uuid GUC. With
1608 * the GUC on, an @c AGG_TOKEN cell prints as the underlying provenance
1609 * UUID, which is convenient for tooling that wants to click through to
1610 * the circuit but loses the human-readable aggregate value. This
1611 * function takes such a UUID and returns the original @c "value (*)"
1612 * string by reading the gate's @c extra (set by aggregate evaluation
1613 * to the computed scalar). Returns @c NULL if @p token does not
1614 * resolve to an @c agg gate.
1615 *
1616 * @param token UUID of an @c agg gate (typically obtained from an
1617 * @c AGG_TOKEN cell when @c aggtoken_text_as_uuid is on,
1618 * or via a manual UUID cast otherwise).
1619 */
1620CREATE OR REPLACE FUNCTION agg_token_value_text(token UUID)
1621 RETURNS TEXT AS
1622$$
1623 SELECT CASE
1624 WHEN provsql.get_gate_type(token) = 'agg'
1625 THEN provsql.get_extra(token) || ' (*)'
1626 ELSE NULL
1627 END;
1628$$ LANGUAGE sql STABLE STRICT PARALLEL SAFE;
1630/** @brief Cast an AGG_TOKEN to NUMERIC (extracts the aggregate value, loses provenance) */
1631CREATE OR REPLACE FUNCTION agg_token_to_numeric(AGG_TOKEN)
1632 RETURNS NUMERIC
1633 AS 'provsql','agg_token_to_numeric' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
1634
1635/** @brief Cast an AGG_TOKEN to double precision (extracts the aggregate value, loses provenance) */
1636CREATE OR REPLACE FUNCTION agg_token_to_float8(AGG_TOKEN)
1637 RETURNS double precision
1638 AS 'provsql','agg_token_to_float8' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
1639
1640/** @brief Cast an AGG_TOKEN to INTEGER (extracts the aggregate value, loses provenance) */
1641CREATE OR REPLACE FUNCTION agg_token_to_int4(AGG_TOKEN)
1642 RETURNS INTEGER
1643 AS 'provsql','agg_token_to_int4' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
1644
1645/** @brief Cast an AGG_TOKEN to bigint (extracts the aggregate value, loses provenance) */
1646CREATE OR REPLACE FUNCTION agg_token_to_int8(AGG_TOKEN)
1647 RETURNS bigint
1648 AS 'provsql','agg_token_to_int8' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
1650/** @brief Cast an AGG_TOKEN to TEXT (extracts the aggregate value, loses provenance) */
1651CREATE OR REPLACE FUNCTION agg_token_to_text(AGG_TOKEN)
1652 RETURNS TEXT
1653 AS 'provsql','agg_token_to_text' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
1654
1655/** @brief Implicit PostgreSQL cast from AGG_TOKEN to NUMERIC (enables arithmetic on aggregates) */
1656CREATE CAST (AGG_TOKEN AS NUMERIC) WITH FUNCTION agg_token_to_numeric(AGG_TOKEN) AS IMPLICIT;
1657/** @brief Assignment cast from AGG_TOKEN to double precision */
1658CREATE CAST (AGG_TOKEN AS double precision) WITH FUNCTION agg_token_to_float8(AGG_TOKEN) AS ASSIGNMENT;
1659/** @brief Assignment cast from AGG_TOKEN to INTEGER */
1660CREATE CAST (AGG_TOKEN AS INTEGER) WITH FUNCTION agg_token_to_int4(AGG_TOKEN) AS ASSIGNMENT;
1661/** @brief Assignment cast from AGG_TOKEN to bigint */
1662CREATE CAST (AGG_TOKEN AS bigint) WITH FUNCTION agg_token_to_int8(AGG_TOKEN) AS ASSIGNMENT;
1663/** @brief Assignment cast from AGG_TOKEN to TEXT (extracts value, not UUID) */
1664CREATE CAST (AGG_TOKEN AS TEXT) WITH FUNCTION agg_token_to_text(AGG_TOKEN) AS ASSIGNMENT;
1666/**
1667 * @brief Placeholder comparison of AGG_TOKEN with NUMERIC
1668 *
1669 * This function is never actually called; it exists so the SQL parser
1670 * accepts comparison operators between AGG_TOKEN and NUMERIC values.
1671 * The ProvSQL query rewriter replaces these comparisons at plan time.
1672 */
1673CREATE OR REPLACE FUNCTION agg_token_comp_numeric(a AGG_TOKEN, b NUMERIC)
1674RETURNS BOOLEAN
1675LANGUAGE plpgsql
1676IMMUTABLE STRICT PARALLEL SAFE
1677AS $$
1678BEGIN
1679 RAISE EXCEPTION 'Comparison AGG_TOKEN-NUMERIC not implemented, should be replaced by ProvSQL behavior';
1680END;
1681$$;
1682
1683/**
1684 * @brief Placeholder comparison of NUMERIC with AGG_TOKEN
1685 *
1686 * Symmetric to agg_token_comp_numeric; never actually called.
1687 * The ProvSQL query rewriter replaces these comparisons at plan time.
1688 */
1689CREATE OR REPLACE FUNCTION numeric_comp_agg_token(a NUMERIC, b AGG_TOKEN)
1690RETURNS BOOLEAN
1691LANGUAGE plpgsql
1692IMMUTABLE STRICT PARALLEL SAFE
1693AS $$
1694BEGIN
1695 RAISE EXCEPTION 'Comparison NUMERIC-AGG_TOKEN not implemented, should be replaced by ProvSQL behavior';
1696END;
1697$$;
1698
1699/** @brief SQL operator AGG_TOKEN < NUMERIC (placeholder rewritten by ProvSQL at plan time) */
1700CREATE OPERATOR < (
1701 LEFTARG = AGG_TOKEN,
1702 RIGHTARG = NUMERIC,
1703 PROCEDURE = agg_token_comp_numeric,
1704 COMMUTATOR = >,
1705 NEGATOR = >=
1706);
1707/** @brief SQL operator NUMERIC < AGG_TOKEN (placeholder rewritten by ProvSQL at plan time) */
1708CREATE OPERATOR < (
1709 LEFTARG = NUMERIC,
1710 RIGHTARG = AGG_TOKEN,
1711 PROCEDURE = numeric_comp_agg_token,
1712 COMMUTATOR = >,
1713 NEGATOR = >=
1714);
1715
1716/** @brief SQL operator AGG_TOKEN <= NUMERIC (placeholder rewritten by ProvSQL at plan time) */
1717CREATE OPERATOR <= (
1718 LEFTARG = AGG_TOKEN,
1719 RIGHTARG = NUMERIC,
1720 PROCEDURE = agg_token_comp_numeric,
1721 COMMUTATOR = >=,
1722 NEGATOR = >
1723);
1724/** @brief SQL operator NUMERIC <= AGG_TOKEN (placeholder rewritten by ProvSQL at plan time) */
1725CREATE OPERATOR <= (
1726 LEFTARG = NUMERIC,
1727 RIGHTARG = AGG_TOKEN,
1728 PROCEDURE = numeric_comp_agg_token,
1729 COMMUTATOR = >=,
1730 NEGATOR = >
1731);
1732
1733/** @brief SQL operator AGG_TOKEN = NUMERIC (placeholder rewritten by ProvSQL at plan time) */
1734CREATE OPERATOR = (
1735 LEFTARG = AGG_TOKEN,
1736 RIGHTARG = NUMERIC,
1737 PROCEDURE = agg_token_comp_numeric,
1738 COMMUTATOR = =,
1739 NEGATOR = <>
1740);
1741/** @brief SQL operator NUMERIC = AGG_TOKEN (placeholder rewritten by ProvSQL at plan time) */
1742CREATE OPERATOR = (
1743 LEFTARG = NUMERIC,
1744 RIGHTARG = AGG_TOKEN,
1745 PROCEDURE = numeric_comp_agg_token,
1746 COMMUTATOR = =,
1747 NEGATOR = <>
1748);
1749
1750/** @brief SQL operator AGG_TOKEN <> NUMERIC (placeholder rewritten by ProvSQL at plan time) */
1751CREATE OPERATOR <> (
1752 LEFTARG = AGG_TOKEN,
1753 RIGHTARG = NUMERIC,
1754 PROCEDURE = agg_token_comp_numeric,
1755 COMMUTATOR = <>,
1756 NEGATOR = =
1757);
1758/** @brief SQL operator NUMERIC <> AGG_TOKEN (placeholder rewritten by ProvSQL at plan time) */
1759CREATE OPERATOR <> (
1760 LEFTARG = NUMERIC,
1761 RIGHTARG = AGG_TOKEN,
1762 PROCEDURE = numeric_comp_agg_token,
1763 COMMUTATOR = <>,
1764 NEGATOR = =
1765);
1766
1767/** @brief SQL operator AGG_TOKEN >= NUMERIC (placeholder rewritten by ProvSQL at plan time) */
1768CREATE OPERATOR >= (
1769 LEFTARG = AGG_TOKEN,
1770 RIGHTARG = NUMERIC,
1771 PROCEDURE = agg_token_comp_numeric,
1772 COMMUTATOR = <=,
1773 NEGATOR = <
1774);
1775/** @brief SQL operator NUMERIC >= AGG_TOKEN (placeholder rewritten by ProvSQL at plan time) */
1776CREATE OPERATOR >= (
1777 LEFTARG = NUMERIC,
1778 RIGHTARG = AGG_TOKEN,
1779 PROCEDURE = numeric_comp_agg_token,
1780 COMMUTATOR = <=,
1781 NEGATOR = <
1782);
1783
1784/** @brief SQL operator AGG_TOKEN > NUMERIC (placeholder rewritten by ProvSQL at plan time) */
1785CREATE OPERATOR > (
1786 LEFTARG = AGG_TOKEN,
1787 RIGHTARG = NUMERIC,
1788 PROCEDURE = agg_token_comp_numeric,
1789 COMMUTATOR = <,
1790 NEGATOR = <=
1791);
1792/** @brief SQL operator NUMERIC > AGG_TOKEN (placeholder rewritten by ProvSQL at plan time) */
1793CREATE OPERATOR > (
1794 LEFTARG = NUMERIC,
1795 RIGHTARG = AGG_TOKEN,
1796 PROCEDURE = numeric_comp_agg_token,
1797 COMMUTATOR = <,
1798 NEGATOR = <=
1799);
1800
1801/** @} */
1803/** @defgroup random_variable_type Type for continuous random variables
1804 *
1805 * Custom type <tt>random_variable</tt>: a thin wrapper around a
1806 * provenance gate UUID, used to expose continuous probabilistic
1807 * c-tables in SQL. The UUID indexes either a <tt>gate_rv</tt>
1808 * (an actual distribution) or a <tt>gate_value</tt> (a
1809 * zero-variance constant produced by <tt>provsql.as_random</tt>).
1810 * Binary-coercible with <tt>UUID</tt> (same 16-byte layout), so an
1811 * <tt>rv</tt>-typed expression flows directly into any function
1812 * expecting a UUID at zero runtime cost.
1813 *
1814 * Constructors live in this group: <tt>provsql.normal(μ, σ)</tt>,
1815 * <tt>provsql.uniform(a, b)</tt>, <tt>provsql.exponential(λ)</tt>,
1816 * <tt>provsql.erlang(k, λ)</tt>, and <tt>provsql.as_random(c)</tt>.
1817 * Operator overloads
1818 * (<tt>+ - * /</tt> and the six comparators) are defined further
1819 * below, alongside direct <tt>rv_cmp_*</tt> UUID constructors for
1820 * callers that want a <tt>gate_cmp</tt> token without going through
1821 * the planner hook.
1822 * @{
1823 */
1824
1825CREATE TYPE random_variable;
1826
1827/** @brief Input function for the random_variable type */
1828CREATE OR REPLACE FUNCTION random_variable_in(CSTRING)
1829 RETURNS random_variable
1830 AS 'provsql','random_variable_in' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
1831
1832/** @brief Output function for the random_variable type */
1833CREATE OR REPLACE FUNCTION random_variable_out(random_variable)
1834 RETURNS CSTRING
1835 AS 'provsql','random_variable_out' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
1836
1837CREATE TYPE random_variable (
1838 internallength = 16,
1839 input = random_variable_in,
1840 output = random_variable_out,
1841 alignment = char
1842);
1843
1844/** @brief Build a random_variable from a UUID (internal). */
1845CREATE OR REPLACE FUNCTION random_variable_make(tok UUID)
1846 RETURNS random_variable
1847 AS 'provsql','random_variable_make' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
1848
1849/** @brief Binary-coercible cast random_variable -> UUID.
1850 * A random_variable is byte-for-byte a pg_uuid_t (alignment char,
1851 * length 16), so WITHOUT FUNCTION lets PostgreSQL reinterpret the
1852 * bytes at zero runtime cost. The cast is IMPLICIT so a `provsql`
1853 * column or any random_variable expression flows directly into any
1854 * function expecting a UUID. */
1855CREATE CAST (random_variable AS UUID) WITHOUT FUNCTION AS IMPLICIT;
1856CREATE CAST (UUID AS random_variable) WITHOUT FUNCTION;
1857
1858/**
1859 * @brief Internal: true iff @p x is a finite (non-NaN, non-±∞) float8.
1860 *
1861 * PostgreSQL's <tt>isnan</tt> is defined for <tt>NUMERIC</tt> only,
1862 * not for <tt>double precision</tt>; we use the inequality form,
1863 * which works because PG defines <tt>NaN = NaN</tt> as <tt>TRUE</tt>
1864 * for floats (so <tt>NaN <> 'NaN'::float8</tt> is <tt>FALSE</tt>).
1865 */
1866CREATE OR REPLACE FUNCTION is_finite_float8(x double precision)
1867 RETURNS BOOL AS
1868$$
1869 SELECT $1 <> 'NaN'::float8 AND $1 <> 'Infinity'::float8 AND $1 <> '-Infinity'::float8;
1870$$ LANGUAGE sql IMMUTABLE STRICT PARALLEL SAFE;
1871
1872/**
1873 * @brief Construct a normal-distribution random variable
1874 *
1875 * Creates a fresh <tt>gate_rv</tt> with @c "normal:μ,σ" stored in
1876 * the gate's @c extra field, and returns a <tt>random_variable</tt>
1877 * pointing at it.
1878 *
1879 * Validation:
1880 * - @p mu and @p sigma must be finite (no @c NaN, no @c ±Infinity).
1881 * - @p sigma must be non-negative.
1882 * - When @p sigma is zero the distribution degenerates to the Dirac
1883 * at @p mu; the call is silently routed through @c as_random(mu),
1884 * producing a @c gate_value rather than a zero-variance @c gate_rv.
1885 * This keeps the sampler / moment / boundcheck paths free of σ=0
1886 * special cases and lets <tt>normal(x, 0)</tt> share its gate with
1887 * <tt>as_random(x)</tt>.
1888 *
1889 * @warning The <tt>VOLATILE</tt> marking is load-bearing and must
1890 * not be weakened. Each call mints a fresh <tt>uuid_generate_v4</tt>
1891 * token because two calls to <tt>normal(0, 1)</tt> are *independent*
1892 * random variables; if PostgreSQL were allowed to fold the function
1893 * (which it would under <tt>STABLE</tt> / <tt>IMMUTABLE</tt>), two
1894 * calls in the same query would share a UUID and collapse into a
1895 * single dependent RV, silently breaking the c-table semantics.
1896 * Same warning applies to @c uniform and @c exponential below.
1897 *
1898 * @sa <a href="https://en.wikipedia.org/wiki/Normal_distribution">Wikipedia: Normal distribution</a>
1899 */
1900CREATE OR REPLACE FUNCTION normal(mu double precision, sigma double precision)
1901 RETURNS random_variable AS
1902$$
1903DECLARE
1904 token UUID;
1905BEGIN
1906 IF NOT provsql.is_finite_float8(mu) OR NOT provsql.is_finite_float8(sigma) THEN
1907 RAISE EXCEPTION 'provsql.normal: parameters must be finite (got mu=%, sigma=%)', mu, sigma;
1908 END IF;
1909 IF sigma < 0 THEN
1910 RAISE EXCEPTION 'provsql.normal: sigma must be non-negative (got %)', sigma;
1911 END IF;
1912 IF sigma = 0 THEN
1913 RETURN provsql.as_random(mu);
1914 END IF;
1915 token := public.uuid_generate_v4();
1916 PERFORM provsql.create_gate(token, 'rv');
1917 PERFORM provsql.set_extra(token, 'normal:' || mu || ',' || sigma);
1918 RETURN provsql.random_variable_make(token);
1919END
1920$$ LANGUAGE plpgsql STRICT VOLATILE PARALLEL SAFE;
1922/**
1923 * @brief Construct a uniform-distribution random variable on [a, b]
1924 *
1925 * Validation:
1926 * - @p a and @p b must be finite.
1927 * - @p a must be ≤ @p b (reversed bounds are rejected).
1928 * - When <tt>a = b</tt> the distribution is the Dirac at @p a; the
1929 * call is silently routed through @c as_random(a) for the same
1930 * reason as @c normal with @p sigma = 0.
1931 *
1932 * @warning <tt>VOLATILE</tt> is load-bearing; see the warning on
1933 * @ref normal.
1934 *
1935 * @sa <a href="https://en.wikipedia.org/wiki/Continuous_uniform_distribution">Wikipedia: Continuous uniform distribution</a>
1936 */
1937CREATE OR REPLACE FUNCTION uniform(a double precision, b double precision)
1938 RETURNS random_variable AS
1939$$
1940DECLARE
1941 token UUID;
1942BEGIN
1943 IF NOT provsql.is_finite_float8(a) OR NOT provsql.is_finite_float8(b) THEN
1944 RAISE EXCEPTION 'provsql.uniform: bounds must be finite (got a=%, b=%)', a, b;
1945 END IF;
1946 IF a > b THEN
1947 RAISE EXCEPTION 'provsql.uniform: a must be <= b (got a=%, b=%)', a, b;
1948 END IF;
1949 IF a = b THEN
1950 RETURN provsql.as_random(a);
1951 END IF;
1952 token := public.uuid_generate_v4();
1953 PERFORM provsql.create_gate(token, 'rv');
1954 PERFORM provsql.set_extra(token, 'uniform:' || a || ',' || b);
1955 RETURN provsql.random_variable_make(token);
1956END
1957$$ LANGUAGE plpgsql STRICT VOLATILE PARALLEL SAFE;
1958
1959/**
1960 * @brief Construct an exponential-distribution random variable with rate λ
1962 * Validation:
1963 * - @p lambda must be finite and strictly positive. No degenerate
1964 * form exists for the exponential distribution, so there is no
1965 * silent route through @c as_random.
1966 *
1967 * @warning <tt>VOLATILE</tt> is load-bearing; see the warning on
1968 * @ref normal.
1969 *
1970 * @sa <a href="https://en.wikipedia.org/wiki/Exponential_distribution">Wikipedia: Exponential distribution</a>
1971 */
1972CREATE OR REPLACE FUNCTION exponential(lambda double precision)
1973 RETURNS random_variable AS
1974$$
1975DECLARE
1976 token UUID;
1977BEGIN
1978 IF NOT provsql.is_finite_float8(lambda) THEN
1979 RAISE EXCEPTION 'provsql.exponential: lambda must be finite (got %)', lambda;
1980 END IF;
1981 IF lambda <= 0 THEN
1982 RAISE EXCEPTION 'provsql.exponential: lambda must be strictly positive (got %)', lambda;
1983 END IF;
1984 token := public.uuid_generate_v4();
1985 PERFORM provsql.create_gate(token, 'rv');
1986 PERFORM provsql.set_extra(token, 'exponential:' || lambda);
1987 RETURN provsql.random_variable_make(token);
1988END
1989$$ LANGUAGE plpgsql STRICT VOLATILE PARALLEL SAFE;
1990
1992 * @brief Construct an Erlang-distribution random variable, sum of
1993 * @p k i.i.d. exponentials with shared rate @p lambda
1994 *
1995 * The Erlang distribution is the sum of @p k independent
1996 * <tt>Exp(λ)</tt> random variables (equivalently the gamma with
1997 * INTEGER shape). It is the natural closure of i.i.d.
1998 * exponentials under addition, and is materialised here as a single
1999 * <tt>gate_rv</tt> so the analytic CDF and closed-form moments fire
2000 * directly (rather than the sampler having to draw and sum @p k
2001 * exponential leaves per Monte-Carlo iteration).
2002 *
2003 * Validation:
2004 * - @p k must be ≥ 1. The degenerate @c k=1 case is silently routed
2005 * through @c exponential so <tt>erlang(1, λ)</tt> shares its gate
2006 * with <tt>exponential(λ)</tt>.
2007 * - @p lambda must be finite and strictly positive.
2008 *
2009 * @warning <tt>VOLATILE</tt> is load-bearing; see the warning on
2010 * @ref normal.
2011 *
2012 * @sa <a href="https://en.wikipedia.org/wiki/Erlang_distribution">Wikipedia: Erlang distribution</a>
2013 */
2014CREATE OR REPLACE FUNCTION erlang(k INTEGER, lambda double precision)
2015 RETURNS random_variable AS
2016$$
2017DECLARE
2018 token UUID;
2019BEGIN
2020 IF k < 1 THEN
2021 RAISE EXCEPTION 'provsql.erlang: k must be >= 1 (got %)', k;
2022 END IF;
2023 IF NOT provsql.is_finite_float8(lambda) THEN
2024 RAISE EXCEPTION 'provsql.erlang: lambda must be finite (got %)', lambda;
2025 END IF;
2026 IF lambda <= 0 THEN
2027 RAISE EXCEPTION 'provsql.erlang: lambda must be strictly positive (got %)', lambda;
2028 END IF;
2029 IF k = 1 THEN
2030 RETURN provsql.exponential(lambda);
2031 END IF;
2032 token := public.uuid_generate_v4();
2033 PERFORM provsql.create_gate(token, 'rv');
2034 PERFORM provsql.set_extra(token, 'erlang:' || k || ',' || lambda);
2035 RETURN provsql.random_variable_make(token);
2036END
2037$$ LANGUAGE plpgsql STRICT VOLATILE PARALLEL SAFE;
2038
2039/**
2040 * @brief Construct a probabilistic-mixture random variable.
2041 *
2042 * Returns a @c random_variable whose distribution is a Bernoulli
2043 * mixture of two scalar RV roots: with probability <tt>P(p = true)</tt>
2044 * the mixture samples @p x, with the complementary probability it
2045 * samples @p y. The mixing token @p p is a @c gate_input Bernoulli
2046 * whose probability has been pinned with @c set_prob, and the same
2047 * @p p can be shared with other branches of the circuit -- the
2048 * Monte-Carlo sampler's per-iteration cache couples every reference
2049 * to the same draw, so users can build joint conditional structures
2050 * (e.g. <tt>mixture(p, X1, Y1) + mixture(p, X2, Y2)</tt> samples
2051 * X1 + X2 with prob π and Y1 + Y2 with prob 1-π).
2052 *
2053 * @p x and @p y may be any scalar RV root: a base @c gate_rv
2054 * (@c normal / @c uniform / @c exponential / @c erlang), a
2055 * @c gate_value Dirac (@c as_random), a @c gate_arith expression, or
2056 * another @c mixture. N-ary mixtures are built by composition --
2057 * <tt>mixture(p1, A, mixture(p2, B, C))</tt> realises a 3-component
2058 * mixture with effective weights <tt>π1, (1-π1)·π2, (1-π1)·(1-π2)</tt>.
2059 *
2060 * Validation:
2061 * - @p p must point to a Boolean gate (@c input, @c mulinput,
2062 * @c update, @c plus, @c times, @c monus, @c project, @c eq,
2063 * @c cmp, @c zero, @c one). Compound Boolean gates derive their
2064 * probability from their atoms via the active probability-evaluation
2065 * method; a bare @c gate_input's probability is whatever @c set_prob
2066 * pinned (@c set_prob is responsible for keeping it in [0, 1]).
2067 * - @p x and @p y must be scalar RV roots; aggregate / Boolean roots
2068 * are rejected at construction.
2069 *
2070 * Two calls to @c mixture with the same @c (p, x, y) operands collapse
2071 * to the same @c gate_mixture node by v5-hash, exactly like
2072 * @c arith(PLUS, X, Y). Draw independence is controlled by @p p:
2073 * sharing @p p couples branch selection across consumers via the
2074 * sampler's @c bool_cache_; minting independent Bernoullis (e.g. via
2075 * the @c mixture(p_value, …) overload) decouples them.
2076 *
2077 * @sa <a href="https://en.wikipedia.org/wiki/Mixture_distribution">Wikipedia: Mixture distribution</a>
2079CREATE OR REPLACE FUNCTION mixture(
2080 p UUID, x random_variable, y random_variable)
2081 RETURNS random_variable AS
2082$$
2083DECLARE
2084 token UUID;
2085 p_kind provsql.PROVENANCE_GATE;
2086 x_uuid UUID;
2087 y_uuid UUID;
2088 x_kind provsql.PROVENANCE_GATE;
2089 y_kind provsql.PROVENANCE_GATE;
2090BEGIN
2091 p_kind := provsql.get_gate_type(p);
2092 IF p_kind NOT IN ('input','mulinput','update',
2093 'plus','times','monus',
2094 'project','eq','cmp',
2095 'zero','one') THEN
2096 RAISE EXCEPTION 'provsql.mixture: p must be a Boolean gate '
2097 '(input/mulinput/update/plus/times/monus/project/eq/cmp/zero/one), got %', p_kind;
2098 END IF;
2099
2100 x_uuid := (x)::UUID;
2101 y_uuid := (y)::UUID;
2102 x_kind := provsql.get_gate_type(x_uuid);
2103 y_kind := provsql.get_gate_type(y_uuid);
2104 IF x_kind NOT IN ('rv','value','arith','mixture') THEN
2105 RAISE EXCEPTION 'provsql.mixture: x must be a scalar RV root (rv / value / arith / mixture), got %', x_kind;
2106 END IF;
2107 IF y_kind NOT IN ('rv','value','arith','mixture') THEN
2108 RAISE EXCEPTION 'provsql.mixture: y must be a scalar RV root (rv / value / arith / mixture), got %', y_kind;
2109 END IF;
2110
2111 token := public.uuid_generate_v5(
2112 provsql.uuid_ns_provsql(),
2113 concat('mixture', p, x_uuid, y_uuid));
2114 PERFORM provsql.create_gate(token, 'mixture', ARRAY[p, x_uuid, y_uuid]);
2115 RETURN provsql.random_variable_make(token);
2117$$ LANGUAGE plpgsql STRICT IMMUTABLE PARALLEL SAFE;
2118
2119/**
2120 * @brief Ad-hoc mixture constructor that mints a fresh anonymous
2121 * @c gate_input Bernoulli with probability @p p_value.
2123 * Sugar over the @c mixture(UUID, x, y) form: when the caller doesn't
2124 * care about reusing the Bernoulli token elsewhere in the circuit
2125 * (which is the common case &mdash; "give me a 0.3 / 0.7 weighted GMM,
2126 * I don't need to share the coin"), this overload creates the
2127 * underlying @c gate_input on the fly with a fresh
2128 * @c uuid_generate_v4() token, pins @p p_value via @c set_prob, and
2129 * threads everything into the UUID-keyed constructor.
2130 *
2131 * Each call mints a NEW Bernoulli, so two calls to
2132 * <tt>mixture(0.5, X, Y)</tt> are *independent* mixtures whose branch
2133 * selections are uncorrelated. When coupling is desired (e.g. two
2134 * mixtures sharing a coin), use the @c mixture(UUID, x, y) form with a
2135 * user-managed @c gate_input token.
2136 *
2137 * @warning <tt>VOLATILE</tt> is load-bearing for the same reason as
2138 * @ref normal and the other RV constructors -- folding under
2139 * @c STABLE / @c IMMUTABLE would collapse two independent draws into
2140 * one shared gate.
2141 *
2142 * @sa <a href="https://en.wikipedia.org/wiki/Mixture_distribution">Wikipedia: Mixture distribution</a>
2143 */
2144CREATE OR REPLACE FUNCTION mixture(
2145 p_value double precision,
2146 x random_variable,
2147 y random_variable)
2148 RETURNS random_variable AS
2149$$
2150DECLARE
2151 p_token UUID;
2152BEGIN
2153 IF p_value IS NULL OR p_value <> p_value OR p_value < 0 OR p_value > 1 THEN
2154 RAISE EXCEPTION 'provsql.mixture: probability must be in [0,1] (got %)', p_value;
2155 END IF;
2156 p_token := public.uuid_generate_v4();
2157 PERFORM provsql.create_gate(p_token, 'input');
2158 PERFORM provsql.set_prob(p_token, p_value);
2159 RETURN provsql.mixture(p_token, x, y);
2161$$ LANGUAGE plpgsql STRICT VOLATILE PARALLEL SAFE;
2162
2163/**
2164 * @brief Categorical-RV constructor over explicit (probabilities,
2165 * values) arrays.
2167 * Builds a categorical-form @c gate_mixture directly: a fresh
2168 * @c gate_input "key" anchor and one @c gate_mulinput per outcome with
2169 * positive mass, all sharing the key. The wires
2170 * <tt>[key, mul_1, ..., mul_n]</tt> are what downstream evaluators
2171 * (@c Expectation, @c MonteCarloSampler, @c AnalyticEvaluator,
2172 * @c RangeCheck) recognise via @c isCategoricalMixture and treat as a
2173 * scalar RV with the categorical distribution @p probs over
2174 * @p outcomes.
2175 *
2176 * Validation:
2177 * - @p probs and @p outcomes must be non-null, same length, length &ge; 1.
2178 * - Each @c probs[i] must be finite, in <tt>[0, 1]</tt>, and the array
2179 * must sum to 1 within @c 1e-9.
2180 * - Each @c outcomes[i] must be finite.
2181 *
2182 * Each call mints a fresh key gate and a fresh set of mulinputs, so
2183 * two calls to @c categorical with the same arrays are *independent*
2184 * categorical RVs. The marking is @c VOLATILE accordingly.
2185 *
2186 * Degenerate case: a categorical with exactly one positive-mass
2187 * outcome reduces to @c as_random(v) at construction (the block would
2188 * just be a single mulinput, which is operationally a Dirac point
2189 * mass). Two such calls share the @c gate_value UUID via the v5
2190 * convention @c as_random already uses.
2191 *
2192 * @sa @c mixture for the Bernoulli-weighted choice constructor.
2193 * @sa <a href="https://en.wikipedia.org/wiki/Categorical_distribution">Wikipedia: Categorical distribution</a>
2194 */
2195CREATE OR REPLACE FUNCTION categorical(
2196 probs double precision[],
2197 outcomes double precision[])
2198 RETURNS random_variable AS
2199$$
2200DECLARE
2201 n INTEGER;
2202 p_sum double precision := 0.0;
2203 i INTEGER;
2204 key_token UUID;
2205 mix_token UUID;
2206 mul_token UUID;
2207 mul_tokens UUID[] := ARRAY[]::UUID[];
2208 mix_wires UUID[];
2209 pi_i double precision;
2210 vi_i double precision;
2211BEGIN
2212 IF probs IS NULL OR outcomes IS NULL THEN
2213 RAISE EXCEPTION 'provsql.categorical: probs and outcomes must be non-null';
2214 END IF;
2215 n := array_length(probs, 1);
2216 IF n IS NULL OR n < 1 THEN
2217 RAISE EXCEPTION 'provsql.categorical: probs must be non-empty';
2218 END IF;
2219 IF array_length(outcomes, 1) <> n THEN
2220 RAISE EXCEPTION 'provsql.categorical: probs and outcomes must have the same length (got % and %)',
2221 n, array_length(outcomes, 1);
2222 END IF;
2223
2224 FOR i IN 1..n LOOP
2225 pi_i := probs[i];
2226 vi_i := outcomes[i];
2227 -- PostgreSQL diverges from IEEE 754: NaN = NaN is TRUE there, so
2228 -- the canonical x <> x NaN test doesn't fire. Compare against the
2229 -- literal 'NaN'::float8 instead, and reject ±Infinity for outcomes
2230 -- explicitly.
2231 IF pi_i IS NULL OR pi_i = 'NaN'::float8 OR pi_i < 0 OR pi_i > 1 THEN
2232 RAISE EXCEPTION 'provsql.categorical: probs[%] must be in [0,1] (got %)', i, pi_i;
2233 END IF;
2234 IF vi_i IS NULL OR vi_i = 'NaN'::float8
2235 OR vi_i = 'Infinity'::float8 OR vi_i = '-Infinity'::float8 THEN
2236 RAISE EXCEPTION 'provsql.categorical: outcomes[%] must be finite (got %)', i, vi_i;
2237 END IF;
2238 p_sum := p_sum + pi_i;
2239 END LOOP;
2240 IF abs(p_sum - 1.0) > 1e-9 THEN
2241 RAISE EXCEPTION 'provsql.categorical: probs must sum to 1 within 1e-9 (got %)', p_sum;
2242 END IF;
2243
2244 -- Degenerate case: exactly one positive-mass outcome (the rest are
2245 -- zero). The "categorical" is then a Dirac point mass; skip the
2246 -- block-allocation entirely and return @c as_random(v), which yields
2247 -- a shared, v5-keyed gate_value -- exactly what downstream
2248 -- evaluators (rv_moment, AnalyticEvaluator, rv_support) treat
2249 -- specially. Saves a key gate and a mulinput per call, and lets
2250 -- two calls to @c categorical({1.0}, {v}) collide on the same
2251 -- gate_value UUID instead of producing distinct anonymous blocks.
2252 DECLARE
2253 nb_positive INTEGER := 0;
2254 only_idx INTEGER := 0;
2255 BEGIN
2256 FOR i IN 1..n LOOP
2257 IF probs[i] > 0.0 THEN
2258 nb_positive := nb_positive + 1;
2259 only_idx := i;
2260 END IF;
2261 END LOOP;
2262 IF nb_positive = 1 THEN
2263 RETURN provsql.as_random(outcomes[only_idx]);
2264 END IF;
2265 END;
2266
2267 -- Mint the block's key anchor. Probability 1.0 matches the
2268 -- joint-table convention: the categorical mass lives on the
2269 -- mulinputs, the key just identifies the block.
2270 key_token := public.uuid_generate_v4();
2271 PERFORM provsql.create_gate(key_token, 'input');
2272 PERFORM provsql.set_prob(key_token, 1.0);
2273
2274 -- One mulinput per positive-probability outcome. Zero-probability
2275 -- entries contribute no mass and are skipped: the gate_mixture's
2276 -- wire vector is otherwise polluted with no-op leaves.
2277 FOR i IN 1..n LOOP
2278 pi_i := probs[i];
2279 IF pi_i <= 0.0 THEN CONTINUE; END IF;
2280 mul_token := public.uuid_generate_v4();
2281 PERFORM provsql.create_gate(mul_token, 'mulinput', ARRAY[key_token]);
2282 PERFORM provsql.set_prob(mul_token, pi_i);
2283 PERFORM provsql.set_infos(mul_token, (i - 1));
2284 PERFORM provsql.set_extra(mul_token, outcomes[i]::TEXT);
2285 mul_tokens := mul_tokens || mul_token;
2286 END LOOP;
2287
2288 mix_wires := ARRAY[key_token] || mul_tokens;
2289 mix_token := public.uuid_generate_v4();
2290 PERFORM provsql.create_gate(mix_token, 'mixture', mix_wires);
2291 RETURN provsql.random_variable_make(mix_token);
2292END
2293$$ LANGUAGE plpgsql STRICT VOLATILE PARALLEL SAFE;
2294
2295/**
2296 * @brief Lift a deterministic constant into a random_variable
2297 *
2298 * Creates a <tt>gate_value</tt> carrying the constant's TEXT form so
2299 * that comparisons against a <tt>random_variable</tt> column produce
2300 * the same circuit shape regardless of whether the operand is an
2301 * actual RV or a literal constant.
2302 *
2303 * Marked <tt>IMMUTABLE</tt>: the gate UUID is derived deterministically
2304 * from the constant via the same v5 convention as <tt>provenance_semimod</tt>'s
2305 * inline value gate (<tt>concat('value', CAST(c AS VARCHAR))</tt>), so
2306 * <tt>as_random(2)</tt> always resolves to the same gate, and any other
2307 * code path that already creates a value gate for the same constant
2308 * (e.g. <tt>provenance_semimod</tt>) shares the UUID.
2309 * <tt>create_gate</tt> is idempotent on already-mapped tokens, so
2310 * repeat invocations are harmless.
2311 *
2312 * @sa <a href="https://en.wikipedia.org/wiki/Degenerate_distribution">Wikipedia: Degenerate distribution (Dirac point mass)</a>
2313 */
2314CREATE OR REPLACE FUNCTION as_random(c double precision)
2315 RETURNS random_variable AS
2316$$
2317DECLARE
2318 -- Canonicalise -0.0 to +0.0: IEEE 754 defines x + 0.0 = +0.0 for
2319 -- both signed zeros, and is identity for finite, NaN, and ±Infinity.
2320 -- Without this, as_random(-0.0) and as_random(+0.0) would produce
2321 -- different gate UUIDs (their CAST AS VARCHAR TEXT representations
2322 -- differ: '-0' vs '0') even though they denote the same constant.
2323 c_canon double precision := c + 0.0;
2324 c_text varchar := CAST(c_canon AS VARCHAR);
2325 token UUID := public.uuid_generate_v5(
2326 provsql.uuid_ns_provsql(), concat('value', c_text));
2327BEGIN
2328 PERFORM provsql.create_gate(token, 'value');
2329 PERFORM provsql.set_extra(token, c_text);
2330 RETURN provsql.random_variable_make(token);
2331END
2332$$ LANGUAGE plpgsql STRICT IMMUTABLE PARALLEL SAFE;
2333
2334/**
2335 * @brief Implicit cast double precision -> random_variable (lifts a
2336 * scalar literal to a constant RV).
2337 *
2338 * Lets users write <tt>WHERE reading > 2.5::float8</tt> instead of
2339 * <tt>WHERE reading > provsql.as_random(2.5)</tt>; the planner-hook
2340 * rewriter then sees a uniform <tt>random_variable</tt> on both sides.
2341 * Sibling casts below cover @c INTEGER and @c NUMERIC literals so
2342 * plain <tt>WHERE reading > 2</tt> and <tt>WHERE reading > 2.5</tt>
2343 * also work — PostgreSQL's operator resolution does not chain casts
2344 * across more than one step, so each NUMERIC-source type needs its
2345 * own direct cast.
2346 */
2347CREATE CAST (double precision AS random_variable)
2348 WITH FUNCTION as_random(double precision) AS IMPLICIT;
2349
2350/** @brief @c as_random for @c INTEGER (delegates to the @c float8 form). */
2351CREATE OR REPLACE FUNCTION as_random(c INTEGER)
2352 RETURNS random_variable AS
2353$$ SELECT provsql.as_random(c::double precision); $$
2354LANGUAGE sql STRICT IMMUTABLE PARALLEL SAFE;
2355
2356/** @brief @c as_random for @c NUMERIC (delegates to the @c float8 form). */
2357CREATE OR REPLACE FUNCTION as_random(c NUMERIC)
2358 RETURNS random_variable AS
2359$$ SELECT provsql.as_random(c::double precision); $$
2360LANGUAGE sql STRICT IMMUTABLE PARALLEL SAFE;
2362/** @brief Implicit cast INTEGER -> random_variable. */
2363CREATE CAST (INTEGER AS random_variable)
2364 WITH FUNCTION as_random(INTEGER) AS IMPLICIT;
2365
2366/** @brief Implicit cast NUMERIC -> random_variable. */
2367CREATE CAST (NUMERIC AS random_variable)
2368 WITH FUNCTION as_random(NUMERIC) AS IMPLICIT;
2370/**
2371 * @name Arithmetic and comparison on random_variable
2372 *
2373 * Each binary operator below is declared on @c (random_variable,
2374 * random_variable) only; mixed shapes such as <tt>rv + 2</tt> or
2375 * <tt>2.5 > rv</tt> resolve through the implicit casts from
2376 * @c INTEGER / @c NUMERIC / @c double @c precision to
2377 * @c random_variable declared above. This avoids the resolution
2378 * ambiguity that would arise if both <tt>(rv, NUMERIC)</tt> and
2379 * <tt>(rv, rv)</tt> overloads were declared while implicit casts also
2380 * existed.
2381 *
2382 * Arithmetic operators build a @c gate_arith via @c provenance_arith
2383 * and return a new @c random_variable wrapping its UUID.
2384 *
2385 * Comparison operators are placeholders that return @c BOOLEAN and
2386 * raise if executed -- the @c BOOLEAN return type is required so that
2387 * PostgreSQL accepts <tt>WHERE rv > 2</tt> at parse-analyze. The
2388 * planner hook intercepts every such @c OpExpr (matched by
2389 * @c opfuncid against @c constants_t::OID_FUNCTION_RV_CMP) and rewrites
2390 * it into a @c provenance_cmp call whose UUID is conjoined into the
2391 * tuple's @c provsql column via @c provenance_times. Code that needs
2392 * a @c gate_cmp UUID directly (without going through the planner hook)
2393 * uses the @c rv_cmp_* family below, which call @c provenance_cmp
2394 * with the matching float8-comparator OID.
2395 *
2396 * @{
2397 */
2398
2399/** @brief @c random_variable + @c random_variable (gate_arith PLUS). */
2400CREATE OR REPLACE FUNCTION random_variable_plus(
2401 a random_variable, b random_variable)
2402 RETURNS random_variable AS
2403$$
2404 SELECT provsql.random_variable_make(
2405 provsql.provenance_arith(
2406 0, -- PROVSQL_ARITH_PLUS
2407 ARRAY[(a)::UUID,
2408 (b)::UUID]));
2409$$ LANGUAGE sql IMMUTABLE STRICT PARALLEL SAFE;
2410
2411/** @brief @c random_variable - @c random_variable (gate_arith MINUS). */
2412CREATE OR REPLACE FUNCTION random_variable_minus(
2413 a random_variable, b random_variable)
2414 RETURNS random_variable AS
2415$$
2416 SELECT provsql.random_variable_make(
2417 provsql.provenance_arith(
2418 2, -- PROVSQL_ARITH_MINUS
2419 ARRAY[(a)::UUID,
2420 (b)::UUID]));
2421$$ LANGUAGE sql IMMUTABLE STRICT PARALLEL SAFE;
2423/** @brief @c random_variable * @c random_variable (gate_arith TIMES). */
2424CREATE OR REPLACE FUNCTION random_variable_times(
2425 a random_variable, b random_variable)
2426 RETURNS random_variable AS
2427$$
2428 SELECT provsql.random_variable_make(
2429 provsql.provenance_arith(
2430 1, -- PROVSQL_ARITH_TIMES
2431 ARRAY[(a)::UUID,
2432 (b)::UUID]));
2433$$ LANGUAGE sql IMMUTABLE STRICT PARALLEL SAFE;
2435/** @brief @c random_variable / @c random_variable (gate_arith DIV). */
2436CREATE OR REPLACE FUNCTION random_variable_div(
2437 a random_variable, b random_variable)
2438 RETURNS random_variable AS
2439$$
2440 SELECT provsql.random_variable_make(
2441 provsql.provenance_arith(
2442 3, -- PROVSQL_ARITH_DIV
2443 ARRAY[(a)::UUID,
2444 (b)::UUID]));
2445$$ LANGUAGE sql IMMUTABLE STRICT PARALLEL SAFE;
2446
2447/** @brief Unary @c -random_variable (gate_arith NEG). */
2448CREATE OR REPLACE FUNCTION random_variable_neg(a random_variable)
2449 RETURNS random_variable AS
2450$$
2451 SELECT provsql.random_variable_make(
2452 provsql.provenance_arith(
2453 4, -- PROVSQL_ARITH_NEG
2454 ARRAY[(a)::UUID]));
2455$$ LANGUAGE sql IMMUTABLE STRICT PARALLEL SAFE;
2456
2457/**
2458 * @brief Internal helper: float8-comparator OID for a given symbol.
2459 *
2460 * Wraps the @c '&lt;sym&gt;(double precision,double precision)'::regoperator
2461 * lookup so the per-comparator functions read uniformly. Marked
2462 * @c IMMUTABLE because the resolved OID is fixed at catalog level
2463 * (the float8 comparators are core PG and never re-installed).
2465CREATE OR REPLACE FUNCTION random_variable_cmp_oid(sym TEXT)
2466 RETURNS oid AS
2467$$
2468 SELECT (sym || '(double precision,double precision)')::regoperator::oid;
2469$$ LANGUAGE sql IMMUTABLE STRICT PARALLEL SAFE;
2470
2471/* The six @c random_variable_{lt,le,eq,ne,ge,gt} functions below are
2472 * BOOLEAN placeholders -- they exist only so the @c (rv, rv) operators
2473 * can be declared at all (PostgreSQL needs a procedure to bind to the
2474 * operator definition, and a procedure returning anything but @c BOOLEAN
2475 * would be rejected by parse-analyze in a WHERE position). They MUST
2476 * NOT be invoked directly: the planner hook in @c src/provsql.c
2477 * intercepts every @c OpExpr whose @c opfuncid matches one of these and
2478 * rewrites it into a @c provenance_cmp() call against the row's
2479 * provenance. If the executor ever reaches one of these, it means the
2480 * planner hook was bypassed (e.g. @c provsql.active was off), in which
2481 * case raising is the right behaviour. */
2482
2483/** @brief Placeholder body shared by every <tt>random_variable_*</tt>
2484 * comparison procedure. Raises with a uniform message. */
2485CREATE OR REPLACE FUNCTION random_variable_cmp_placeholder(
2486 a random_variable, b random_variable)
2487 RETURNS BOOLEAN AS
2488$$
2489BEGIN
2490 RAISE EXCEPTION 'random_variable comparison must be rewritten by the '
2491 'ProvSQL planner hook (is provsql.active off?)';
2492END
2493$$ LANGUAGE plpgsql IMMUTABLE STRICT PARALLEL SAFE;
2494
2495CREATE OR REPLACE FUNCTION random_variable_lt(
2496 a random_variable, b random_variable) RETURNS BOOLEAN AS
2497$$ SELECT provsql.random_variable_cmp_placeholder(a, b); $$
2498LANGUAGE sql IMMUTABLE STRICT PARALLEL SAFE;
2499
2500CREATE OR REPLACE FUNCTION random_variable_le(
2501 a random_variable, b random_variable) RETURNS BOOLEAN AS
2502$$ SELECT provsql.random_variable_cmp_placeholder(a, b); $$
2503LANGUAGE sql IMMUTABLE STRICT PARALLEL SAFE;
2505CREATE OR REPLACE FUNCTION random_variable_eq(
2506 a random_variable, b random_variable) RETURNS BOOLEAN AS
2507$$ SELECT provsql.random_variable_cmp_placeholder(a, b); $$
2508LANGUAGE sql IMMUTABLE STRICT PARALLEL SAFE;
2509
2510CREATE OR REPLACE FUNCTION random_variable_ne(
2511 a random_variable, b random_variable) RETURNS BOOLEAN AS
2512$$ SELECT provsql.random_variable_cmp_placeholder(a, b); $$
2513LANGUAGE sql IMMUTABLE STRICT PARALLEL SAFE;
2514
2515CREATE OR REPLACE FUNCTION random_variable_ge(
2516 a random_variable, b random_variable) RETURNS BOOLEAN AS
2517$$ SELECT provsql.random_variable_cmp_placeholder(a, b); $$
2518LANGUAGE sql IMMUTABLE STRICT PARALLEL SAFE;
2520CREATE OR REPLACE FUNCTION random_variable_gt(
2521 a random_variable, b random_variable) RETURNS BOOLEAN AS
2522$$ SELECT provsql.random_variable_cmp_placeholder(a, b); $$
2523LANGUAGE sql IMMUTABLE STRICT PARALLEL SAFE;
2524
2525/* Direct UUID constructors -- used by tests and any caller that wants
2526 * a @c gate_cmp without going through the planner hook (e.g. building
2527 * a circuit fragment in a SELECT list). Each delegates to
2528 * @c provenance_cmp with the matching float8-comparator OID. */
2529
2530/** @brief Build a @c gate_cmp for <tt>a &lt; b</tt> and return its UUID. */
2531CREATE OR REPLACE FUNCTION rv_cmp_lt(
2532 a random_variable, b random_variable) RETURNS UUID AS
2534 SELECT provsql.provenance_cmp(
2535 (a)::UUID,
2536 provsql.random_variable_cmp_oid('<'),
2537 (b)::UUID);
2538$$ LANGUAGE sql IMMUTABLE STRICT PARALLEL SAFE;
2539
2540/** @brief Build a @c gate_cmp for <tt>a &le; b</tt> and return its UUID. */
2541CREATE OR REPLACE FUNCTION rv_cmp_le(
2542 a random_variable, b random_variable) RETURNS UUID AS
2543$$
2544 SELECT provsql.provenance_cmp(
2545 (a)::UUID,
2546 provsql.random_variable_cmp_oid('<='),
2547 (b)::UUID);
2548$$ LANGUAGE sql IMMUTABLE STRICT PARALLEL SAFE;
2549
2550/** @brief Build a @c gate_cmp for <tt>a = b</tt> and return its UUID. */
2551CREATE OR REPLACE FUNCTION rv_cmp_eq(
2552 a random_variable, b random_variable) RETURNS UUID AS
2553$$
2554 SELECT provsql.provenance_cmp(
2555 (a)::UUID,
2556 provsql.random_variable_cmp_oid('='),
2557 (b)::UUID);
2558$$ LANGUAGE sql IMMUTABLE STRICT PARALLEL SAFE;
2559
2560/** @brief Build a @c gate_cmp for <tt>a &lt;&gt; b</tt> and return its UUID. */
2561CREATE OR REPLACE FUNCTION rv_cmp_ne(
2562 a random_variable, b random_variable) RETURNS UUID AS
2563$$
2564 SELECT provsql.provenance_cmp(
2565 (a)::UUID,
2566 provsql.random_variable_cmp_oid('<>'),
2567 (b)::UUID);
2568$$ LANGUAGE sql IMMUTABLE STRICT PARALLEL SAFE;
2569
2570/** @brief Build a @c gate_cmp for <tt>a &ge; b</tt> and return its UUID. */
2571CREATE OR REPLACE FUNCTION rv_cmp_ge(
2572 a random_variable, b random_variable) RETURNS UUID AS
2573$$
2574 SELECT provsql.provenance_cmp(
2575 (a)::UUID,
2576 provsql.random_variable_cmp_oid('>='),
2577 (b)::UUID);
2578$$ LANGUAGE sql IMMUTABLE STRICT PARALLEL SAFE;
2579
2580/** @brief Build a @c gate_cmp for <tt>a &gt; b</tt> and return its UUID. */
2581CREATE OR REPLACE FUNCTION rv_cmp_gt(
2582 a random_variable, b random_variable) RETURNS UUID AS
2583$$
2584 SELECT provsql.provenance_cmp(
2585 (a)::UUID,
2586 provsql.random_variable_cmp_oid('>'),
2587 (b)::UUID);
2588$$ LANGUAGE sql IMMUTABLE STRICT PARALLEL SAFE;
2589
2590CREATE OPERATOR + (
2591 LEFTARG = random_variable,
2592 RIGHTARG = random_variable,
2593 PROCEDURE = random_variable_plus,
2594 COMMUTATOR = +
2595);
2596
2597CREATE OPERATOR - (
2598 LEFTARG = random_variable,
2599 RIGHTARG = random_variable,
2600 PROCEDURE = random_variable_minus
2601);
2602
2603CREATE OPERATOR * (
2604 LEFTARG = random_variable,
2605 RIGHTARG = random_variable,
2606 PROCEDURE = random_variable_times,
2607 COMMUTATOR = *
2608);
2609
2610CREATE OPERATOR / (
2611 LEFTARG = random_variable,
2612 RIGHTARG = random_variable,
2613 PROCEDURE = random_variable_div
2614);
2615
2616/** @brief Prefix unary minus on @c random_variable. */
2617CREATE OPERATOR - (
2618 RIGHTARG = random_variable,
2619 PROCEDURE = random_variable_neg
2620);
2621
2622CREATE OPERATOR < (
2623 LEFTARG = random_variable,
2624 RIGHTARG = random_variable,
2625 PROCEDURE = random_variable_lt,
2626 COMMUTATOR = >,
2627 NEGATOR = >=
2628);
2629
2630CREATE OPERATOR <= (
2631 LEFTARG = random_variable,
2632 RIGHTARG = random_variable,
2633 PROCEDURE = random_variable_le,
2634 COMMUTATOR = >=,
2635 NEGATOR = >
2636);
2637
2638CREATE OPERATOR = (
2639 LEFTARG = random_variable,
2640 RIGHTARG = random_variable,
2641 PROCEDURE = random_variable_eq,
2642 COMMUTATOR = =,
2643 NEGATOR = <>
2644);
2645
2646CREATE OPERATOR <> (
2647 LEFTARG = random_variable,
2648 RIGHTARG = random_variable,
2649 PROCEDURE = random_variable_ne,
2650 COMMUTATOR = <>,
2651 NEGATOR = =
2652);
2653
2654CREATE OPERATOR >= (
2655 LEFTARG = random_variable,
2656 RIGHTARG = random_variable,
2657 PROCEDURE = random_variable_ge,
2658 COMMUTATOR = <=,
2659 NEGATOR = <
2660);
2661
2662CREATE OPERATOR > (
2663 LEFTARG = random_variable,
2664 RIGHTARG = random_variable,
2665 PROCEDURE = random_variable_gt,
2666 COMMUTATOR = <,
2667 NEGATOR = <=
2668);
2669
2670/** @} */
2671
2672/**
2673 * @name Aggregates over random_variable
2674 *
2675 * Phase 1 of the SUM-over-RV story: an overload of the standard
2676 * @c sum aggregate that takes a @c random_variable per row and returns
2677 * the @c random_variable representing the (provenance-weighted) sum.
2678 * Lives in the @c provsql schema so a @c sum(random_variable) call
2679 * resolves to it without colliding with the built-in NUMERIC @c sum
2680 * overloads in @c pg_catalog.
2681 *
2682 * Direct calls outside a provenance-tracked query treat each row's
2683 * contribution unconditionally (no per-row Boolean selector). When
2684 * the planner hook sees a @c provsql.sum @c Aggref over a
2685 * provenance-tracked query, it wraps the per-row argument @c x in
2686 * <tt>provsql.mixture(prov_token, x, provsql.as_random(0))</tt> so the
2687 * aggregate's effective semantics become
2688 * @f$\mathrm{SUM}(x) = \sum_i \mathbf{1}\{\varphi_i\} \cdot X_i@f$,
2689 * the natural extension of semimodule-provenance to RV-valued M.
2690 *
2691 * The internal state is the array of UUIDs of the per-row mixtures.
2692 * The final function builds a single @c gate_arith @c PLUS over them
2693 * (or returns @c as_random(0) for an empty group, the additive
2694 * identity). Sharing on @c provenance_arith's v5 hash means two
2695 * @c sum invocations over the same set of rows collide on the same
2696 * gate.
2697 *
2698 * @{
2699 */
2700
2701/**
2702 * @brief Per-row helper: wrap an RV in @c mixture(prov, rv, as_random(0)).
2703 *
2704 * Internal helper used by the planner-hook rewriter to lift a
2705 * @c sum(random_variable) argument into its provenance-aware form.
2706 * Encodes one row's contribution to the SUM as a Bernoulli mixture
2707 * over the row's provenance: with probability @c P(prov) the mixture
2708 * samples @c rv, otherwise it samples the additive identity
2709 * @c as_random(0). Exposed as a regular SQL function so the planner
2710 * can construct a @c FuncExpr by name without needing to disambiguate
2711 * @c mixture / @c as_random overloads at OID-lookup time.
2712 */
2713CREATE OR REPLACE FUNCTION rv_aggregate_semimod(
2714 prov UUID, rv random_variable)
2715 RETURNS random_variable AS
2716$$
2717 SELECT provsql.mixture(prov, rv, provsql.as_random(0::double precision));
2718$$ LANGUAGE sql IMMUTABLE STRICT PARALLEL SAFE;
2719
2720/**
2721 * @brief State-transition function for @c sum(random_variable).
2722 *
2723 * Appends the input RV's UUID to the running array. NULL inputs are
2724 * skipped (matching standard SUM semantics). The aggregate's INITCOND
2725 * is @c '{}' so the FINALFUNC always runs even on an empty group, which
2726 * is what lets us return @c as_random(0) (the additive identity) for
2727 * an empty SUM rather than NULL.
2728 */
2729CREATE OR REPLACE FUNCTION sum_rv_sfunc(
2730 state UUID[], rv random_variable)
2731 RETURNS UUID[] AS
2732$$
2733 SELECT CASE
2734 WHEN rv IS NULL THEN state
2735 ELSE array_append(state, (rv)::UUID)
2736 END;
2737$$ LANGUAGE sql IMMUTABLE PARALLEL SAFE;
2738
2739/**
2740 * @brief Final function for @c sum(random_variable): build a
2741 * @c gate_arith PLUS root.
2742 *
2743 * Empty group (@c state = @c '{}'): return @c as_random(0), the
2744 * additive identity, so SUM over zero rows is the deterministic
2745 * scalar 0 -- matches the AGG_TOKEN convention in @c agg_raw_moment.
2746 *
2747 * Singleton group: return the single child directly without minting a
2748 * useless single-child @c gate_arith.
2749 *
2750 * Otherwise: build @c gate_arith(PLUS, state) via @c provenance_arith.
2751 */
2752CREATE OR REPLACE FUNCTION sum_rv_ffunc(state UUID[])
2753 RETURNS random_variable AS
2754$$
2755DECLARE
2756 arith_token UUID;
2757BEGIN
2758 IF state IS NULL OR array_length(state, 1) IS NULL THEN
2759 RETURN provsql.as_random(0::double precision);
2760 END IF;
2761 IF array_length(state, 1) = 1 THEN
2762 RETURN provsql.random_variable_make(state[1]);
2763 END IF;
2764 arith_token := provsql.provenance_arith(0, state); -- 0 = PROVSQL_ARITH_PLUS
2765 RETURN provsql.random_variable_make(arith_token);
2766END
2767$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
2768
2769CREATE AGGREGATE sum(random_variable) (
2770 SFUNC = sum_rv_sfunc,
2771 STYPE = UUID[],
2772 INITCOND = '{}',
2773 FINALFUNC = sum_rv_ffunc
2774);
2775
2776/**
2777 * @brief Final function for @c avg(random_variable).
2778 *
2779 * Builds the natural lift of @c "AVG = SUM / COUNT" into the
2780 * @c random_variable algebra:
2781 * @f[
2782 * \mathrm{AVG}(x) \;=\; \frac{\sum_i \mathbf{1}\{\varphi_i\} \cdot X_i}
2783 * {\sum_i \mathbf{1}\{\varphi_i\}}
2784 * @f]
2785 * realised as @c gate_arith(DIV, num, denom) where @c num is the
2786 * @c sum(random_variable) gate over the per-row mixtures and @c denom
2787 * is the @c sum(random_variable) gate over the same provenance gates
2788 * weighted by a per-row @c as_random(1) -- exactly the SQL pattern
2789 * "@c sum(x) @c / @c sum(as_random(1))" emitted as a single
2790 * @c random_variable token.
2791 *
2792 * Reuses @c sum_rv_sfunc as the state-transition function so the
2793 * array of per-row UUIDs is collected identically to
2794 * @c sum(random_variable). In a provenance-tracked query the
2795 * planner-hook rewriter routes RV-returning aggregates through
2796 * @c make_rv_aggregate_expression, which wraps each per-row argument
2797 * in @c mixture(prov_i, x_i, as_random(0)); the FFUNC then recovers
2798 * @c prov_i from each mixture's first child to construct the matching
2799 * @c mixture(prov_i, as_random(1), as_random(0)) for the denominator.
2800 * Outside a tracked query the per-row UUIDs are plain RV roots, in
2801 * which case each row contributes an unconditional @c as_random(1)
2802 * to the denominator -- the natural extension of "no provenance =
2803 * every row counts" used elsewhere in the extension.
2804 *
2805 * Empty group: returns @c NULL, matching the standard SQL @c AVG
2806 * convention. This differs from @c sum(random_variable), which
2807 * returns the additive identity @c as_random(0) for an empty group;
2808 * for AVG the multiplicative identity is not the right answer and
2809 * the caller has no way to disambiguate "0 rows" from "rows that
2810 * sum to 0".
2811 */
2812CREATE OR REPLACE FUNCTION avg_rv_ffunc(state UUID[])
2813 RETURNS random_variable AS
2814$$
2815DECLARE
2816 n INTEGER;
2817 i INTEGER;
2818 num_token UUID;
2819 denom_token UUID;
2820 denom_state UUID[] := '{}';
2821 one_uuid UUID;
2822 gtype provsql.PROVENANCE_GATE;
2823 children UUID[];
2824 prov_i UUID;
2825BEGIN
2826 IF state IS NULL THEN
2827 RETURN NULL;
2828 END IF;
2829 n := array_length(state, 1);
2830 IF n IS NULL THEN
2831 RETURN NULL;
2832 END IF;
2833
2834 one_uuid := (
2835 provsql.as_random(1::double precision))::UUID;
2836
2837 FOR i IN 1..n LOOP
2838 gtype := provsql.get_gate_type(state[i]);
2839 IF gtype = 'mixture'::provsql.PROVENANCE_GATE THEN
2840 children := provsql.get_children(state[i]);
2841 prov_i := children[1];
2842 denom_state := array_append(
2843 denom_state,
2844 (
2845 provsql.rv_aggregate_semimod(
2846 prov_i, provsql.as_random(1::double precision)))::UUID);
2847 ELSE
2848 denom_state := array_append(denom_state, one_uuid);
2849 END IF;
2850 END LOOP;
2851
2852 IF n = 1 THEN
2853 num_token := state[1];
2854 denom_token := denom_state[1];
2855 ELSE
2856 num_token := provsql.provenance_arith(0, state); -- 0 = PLUS
2857 denom_token := provsql.provenance_arith(0, denom_state); -- 0 = PLUS
2858 END IF;
2859
2860 RETURN provsql.random_variable_make(
2861 provsql.provenance_arith(
2862 3, -- 3 = PROVSQL_ARITH_DIV
2863 ARRAY[num_token, denom_token]));
2864END
2865$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
2866
2867CREATE AGGREGATE avg(random_variable) (
2868 SFUNC = sum_rv_sfunc,
2869 STYPE = UUID[],
2870 INITCOND = '{}',
2871 FINALFUNC = avg_rv_ffunc
2872);
2873
2874/**
2875 * @brief Final function for @c product(random_variable).
2876 *
2877 * Multiplicative analogue of @c sum(random_variable):
2878 * @f[
2879 * \mathrm{PRODUCT}(x) \;=\; \prod_i \big(\mathbf{1}\{\varphi_i\} \cdot X_i
2880 * + \mathbf{1}\{\neg\varphi_i\} \cdot 1\big)
2881 * \;=\; \prod_{i : \varphi_i} X_i
2882 * @f]
2883 * realised as @c gate_arith(TIMES, mixtures) over per-row contributions
2884 * whose @em else-branch is @c as_random(1) (the multiplicative
2885 * identity), so rows whose provenance is false contribute @c 1 to the
2886 * product instead of @c 0.
2887 *
2888 * The C-side wrap shared with @c sum / @c avg always builds
2889 * @c mixture(prov_i, X_i, as_random(0)); the PRODUCT FFUNC patches each
2890 * mixture's else-branch to @c as_random(1) by reconstructing the
2891 * mixture with the corrected else-arg. Going through
2892 * @c provsql.mixture (rather than @c create_gate directly) keeps the
2893 * gate v5-hash consistent with any other mixture sharing the same
2894 * @c (prov_i, X_i, as_random(1)) triple.
2895 *
2896 * Reuses @c sum_rv_sfunc as the state-transition function. Empty
2897 * group: returns the multiplicative identity @c as_random(1) -- the
2898 * natural counterpart to @c sum(random_variable)'s empty-group
2899 * @c as_random(0).
2900 *
2901 * Singleton group: returns the single patched child directly without
2902 * minting a useless single-child @c gate_arith TIMES root.
2903 *
2904 * Direct (untracked) call: state entries are raw RV uuids rather than
2905 * mixtures; pass them through unchanged so PRODUCT degenerates to the
2906 * straight RV product over all rows, the natural "no provenance =
2907 * every row counts" behaviour.
2908 */
2909CREATE OR REPLACE FUNCTION product_rv_ffunc(state UUID[])
2910 RETURNS random_variable AS
2911$$
2912DECLARE
2913 n INTEGER;
2914 i INTEGER;
2915 prod_state UUID[] := '{}';
2916 one_rv provsql.random_variable;
2917 gtype provsql.PROVENANCE_GATE;
2918 children UUID[];
2919 prov_i UUID;
2920 x_uuid UUID;
2921BEGIN
2922 one_rv := provsql.as_random(1::double precision);
2923
2924 IF state IS NULL THEN
2925 RETURN one_rv;
2926 END IF;
2927 n := array_length(state, 1);
2928 IF n IS NULL THEN
2929 RETURN one_rv;
2930 END IF;
2931
2932 FOR i IN 1..n LOOP
2933 gtype := provsql.get_gate_type(state[i]);
2934 IF gtype = 'mixture'::provsql.PROVENANCE_GATE THEN
2935 children := provsql.get_children(state[i]);
2936 prov_i := children[1];
2937 x_uuid := children[2];
2938 prod_state := array_append(
2939 prod_state,
2940 (
2941 provsql.mixture(
2942 prov_i,
2943 provsql.random_variable_make(x_uuid),
2944 one_rv))::UUID);
2945 ELSE
2946 prod_state := array_append(prod_state, state[i]);
2947 END IF;
2948 END LOOP;
2949
2950 IF n = 1 THEN
2951 RETURN provsql.random_variable_make(prod_state[1]);
2952 END IF;
2953 RETURN provsql.random_variable_make(
2954 provsql.provenance_arith(1, prod_state)); -- 1 = PROVSQL_ARITH_TIMES
2955END
2956$$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE;
2957
2958CREATE AGGREGATE product(random_variable) (
2959 SFUNC = sum_rv_sfunc,
2960 STYPE = UUID[],
2961 INITCOND = '{}',
2962 FINALFUNC = product_rv_ffunc
2963);
2964
2965/** @} */
2966
2967/** @} */
2968
2969/** @defgroup aggregate_provenance Aggregate provenance
2970 * Functions for building and evaluating aggregate (GROUP BY) provenance,
2971 * including the δ-semiring operator and semimodule multiplication.
2972 * @{
2973 */
2974
2975/**
2976 * @brief Create a δ-semiring gate wrapping a provenance token
2977 *
2978 * Used internally for aggregate provenance. Returns the token unchanged
2979 * if it is gate_zero() or gate_one(), and gate_one() if the token is NULL.
2980 */
2981CREATE OR REPLACE FUNCTION provenance_delta
2982 (token UUID)
2983 RETURNS UUID AS
2984$$
2985DECLARE
2986 delta_token UUID;
2987BEGIN
2988 IF token = gate_zero() OR token = gate_one() THEN
2989 return token;
2990 END IF;
2991
2992 IF token IS NULL THEN
2993 return gate_one();
2994 END IF;
2995
2996 delta_token:=uuid_generate_v5(uuid_ns_provsql(),concat('delta',token));
2997
2998 PERFORM create_gate(delta_token,'delta',ARRAY[token::UUID]);
2999
3000 RETURN delta_token;
3001END
3002$$ LANGUAGE plpgsql SET search_path=provsql,pg_temp,public SECURITY DEFINER PARALLEL SAFE;
3003
3004/**
3005 * @brief Build an aggregate provenance gate from grouped tokens
3006 *
3007 * Called internally by the query rewriter for GROUP BY queries.
3008 * Creates an agg gate linking all contributing tokens and records
3009 * the aggregate function OID and the computed scalar value.
3010 *
3011 * @param aggfnoid OID of the SQL aggregate function
3012 * @param aggtype OID of the aggregate result type
3013 * @param val computed aggregate value
3014 * @param tokens array of provenance tokens being aggregated
3015 */
3016CREATE OR REPLACE FUNCTION provenance_aggregate(
3017 aggfnoid INTEGER,
3018 aggtype INTEGER,
3019 val ANYELEMENT,
3020 tokens UUID[])
3021 RETURNS AGG_TOKEN AS
3022$$
3023DECLARE
3024 c INTEGER;
3025 agg_tok UUID;
3026 agg_val varchar;
3027BEGIN
3028 c:=COALESCE(array_length(tokens, 1), 0);
3029
3030 agg_val = CAST(val as VARCHAR);
3031
3032 IF c = 0 THEN
3033 agg_tok := gate_zero();
3034 ELSE
3035 -- aggfnoid must be part of the UUID: SUM(id) and AVG(id) over the
3036 -- same children would otherwise collapse to a single gate, and
3037 -- their concurrent set_infos calls would overwrite each other's
3038 -- aggregation operator (resulting in the wrong agg_kind being
3039 -- read by provsql_having under cross-backend contention).
3040 agg_tok := uuid_generate_v5(
3041 uuid_ns_provsql(),
3042 concat('agg',aggfnoid,tokens));
3043 PERFORM create_gate(agg_tok, 'agg', tokens);
3044 PERFORM set_infos(agg_tok, aggfnoid, aggtype);
3045 PERFORM set_extra(agg_tok, agg_val);
3046 END IF;
3047
3048 RETURN '( '||agg_tok||' , '||agg_val||' )';
3049END
3050$$ LANGUAGE plpgsql PARALLEL SAFE SET search_path=provsql,pg_temp,public SECURITY DEFINER;
3051
3052/**
3053 * @brief Create a semimodule scalar multiplication gate
3054 *
3055 * Pairs a scalar value with a provenance token, used internally by
3056 * the query rewriter for aggregate provenance.
3057 *
3058 * @param val the scalar value
3059 * @param token the provenance token to multiply
3060 */
3061CREATE OR REPLACE FUNCTION provenance_semimod(val ANYELEMENT, token UUID)
3062 RETURNS UUID AS
3063$$
3064DECLARE
3065 semimod_token UUID;
3066 value_token UUID;
3067BEGIN
3068 SELECT uuid_generate_v5(uuid_ns_provsql(),concat('value',CAST(val AS VARCHAR)))
3069 INTO value_token;
3070 SELECT uuid_generate_v5(uuid_ns_provsql(),concat('semimod',value_token,token))
3071 INTO semimod_token;
3072
3073 --create value gates
3074 PERFORM create_gate(value_token,'value');
3075 PERFORM set_extra(value_token, CAST(val AS VARCHAR));
3076
3077 --create semimod gate
3078 PERFORM create_gate(semimod_token,'semimod',ARRAY[token::UUID,value_token]);
3079
3080 RETURN semimod_token;
3081END
3082$$ LANGUAGE plpgsql PARALLEL SAFE SET search_path=provsql,pg_temp,public SECURITY DEFINER;
3083
3084/** @} */
3085
3086/** @defgroup probability Probability and Shapley values
3087 * Functions for computing probabilities, expected values, and
3088 * game-theoretic contribution measures (Shapley/Banzhaf values)
3089 * from provenance circuits.
3090 * @{
3091 */
3092
3093/**
3094 * @brief Compute the probability of a provenance token
3095 *
3096 * Compiles the provenance circuit to d-DNNF and evaluates the
3097 * probability. The compilation method can be selected explicitly.
3098 *
3099 * @param token provenance token to evaluate
3100 * @param method knowledge compilation method (NULL for default)
3101 * @param arguments additional arguments for the method
3102 */
3103CREATE OR REPLACE FUNCTION probability_evaluate(
3104 token UUID,
3105 method TEXT = NULL,
3106 arguments TEXT = NULL)
3107 RETURNS DOUBLE PRECISION AS
3108 'provsql','probability_evaluate' LANGUAGE C STABLE;
3109
3110/**
3111 * @brief Compute the expected value of a probabilistic scalar
3112 *
3113 * Computes E[input | prov] for either an @c AGG_TOKEN (discrete
3114 * SUM/MIN/MAX aggregation over Boolean-input gate_agg circuits, with
3115 * @c prov as the Boolean conditioning event) or a @c random_variable
3116 * (continuous distribution, traversed by the analytical / MC
3117 * evaluator from @c Expectation.cpp).
3118 *
3119 * Implementation: thin wrapper over @c moment(input, 1, prov, method,
3120 * arguments). Both branches converge on the same machinery; the
3121 * AGG_TOKEN side computes E[X] as the @f$k=1@f$ instance of the
3122 * @f$n^k@f$-tuple enumeration in @c agg_raw_moment, the
3123 * random_variable side calls @c compute_expectation through
3124 * @c rv_moment.
3125 *
3126 * @param input aggregate expression or random variable to compute E[·] of
3127 * @param prov provenance condition (defaults to gate_one(), i.e., unconditional)
3128 * @param method knowledge compilation method (AGG_TOKEN path only)
3129 * @param arguments additional arguments for the method (AGG_TOKEN path only)
3130 */
3131CREATE OR REPLACE FUNCTION expected(
3132 input ANYELEMENT,
3133 prov UUID = gate_one(),
3134 method TEXT = NULL,
3135 arguments TEXT = NULL)
3136 RETURNS DOUBLE PRECISION AS $$
3137 SELECT moment(input, 1, prov, method, arguments);
3138$$ LANGUAGE sql PARALLEL SAFE STABLE SET search_path=provsql SECURITY DEFINER;
3139
3140/**
3141 * @brief Internal: shared C entry point for variance / moment / central_moment.
3142 *
3143 * The @c expected() SQL function reaches the Expectation evaluator
3144 * through @c provenance_evaluate_compiled(..., 'expectation', ...).
3145 * The variance / raw-moment / central-moment SQL functions need an
3146 * extra @p k INTEGER argument that does not fit that dispatcher's
3147 * signature, so they go through this dedicated entry point. Returns
3148 * E[X^k] when @p central is FALSE, or E[(X - E[X])^k] when TRUE.
3149 */
3150CREATE OR REPLACE FUNCTION rv_moment(
3151 token UUID, k INTEGER, central BOOLEAN,
3152 prov UUID DEFAULT gate_one())
3153 RETURNS double precision
3154 AS 'provsql','rv_moment' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
3155
3156/**
3157 * @brief Compute the raw moment E[X^k | prov] of an AGG_TOKEN aggregate
3158 *
3159 * Sister of @c expected() for the AGG_TOKEN side of the polymorphic
3160 * @c moment / @c variance / @c central_moment dispatch. Supports the
3161 * same aggregation functions as @c expected: SUM (which COUNT
3162 * normalises to at the gate level via @c Aggregation.cpp:322), MIN,
3163 * and MAX.
3164 *
3165 * Strategy:
3166 * - <b>SUM</b>: with X = Σᵢ Iᵢ·vᵢ (Iᵢ the per-row inclusion indicator,
3167 * vᵢ the row's value), expanding X^k and taking expectation gives
3168 * @f$E[X^k] = \sum_{(i_1,\ldots,i_k) \in \{1..n\}^k} v_{i_1}\cdots v_{i_k}
3169 * \cdot P(\bigwedge_{i \in \TEXT{distinct}(i_1..i_k)} I_i)@f$.
3170 * We enumerate the @f$n^k@f$ tuples, conjoin the distinct inclusion
3171 * tokens (and @p prov when conditioning), and evaluate the
3172 * probability via @c probability_evaluate.
3173 * - <b>MIN / MAX</b>: replace @c v with @c v^k in the rank-based
3174 * enumeration that @c expected already uses; @c MAX is handled by
3175 * sign-flipping per the existing trick (negate vs. rerank), with
3176 * the outer multiplier becoming @f$(-1)^k@f$ instead of just @f$-1@f$.
3177 *
3178 * Cost: SUM is @f$O(n^k)@f$ probability evaluations -- tractable for
3179 * small @p k or small @p n; for larger sizes, prefer reaching for the
3180 * sampler. MIN / MAX stay linear in @p n.
3181 */
3182CREATE OR REPLACE FUNCTION agg_raw_moment(
3183 token AGG_TOKEN,
3184 k INTEGER,
3185 prov UUID = gate_one(),
3186 method TEXT = NULL,
3187 arguments TEXT = NULL)
3188 RETURNS DOUBLE PRECISION AS $$
3189DECLARE
3190 aggregation_function VARCHAR;
3191 child_pairs UUID[];
3192 pair_children UUID[];
3193 n INTEGER;
3194 i INTEGER;
3195 j INTEGER;
3196 vals float8[];
3197 toks UUID[];
3198 total float8;
3199 total_probability float8;
3200 tup INTEGER[];
3201 d INTEGER;
3202 prod_v float8;
3203 distinct_tok UUID[];
3204 conj_token UUID;
3205 prob float8;
3206 sign_max float8;
3207BEGIN
3208 IF token IS NULL OR k IS NULL THEN
3209 RETURN NULL;
3210 END IF;
3211 IF k < 0 THEN
3212 RAISE EXCEPTION 'agg_raw_moment(): k must be non-negative (got %)', k;
3213 END IF;
3214 IF get_gate_type(token) <> 'agg' THEN
3215 RAISE EXCEPTION USING MESSAGE='Wrong gate type for agg_raw_moment computation';
3216 END IF;
3217 IF k = 0 THEN
3218 RETURN 1;
3219 END IF;
3220
3221 SELECT pp.proname::varchar FROM pg_proc pp
3222 WHERE oid=(get_infos(token)).info1
3223 INTO aggregation_function;
3224
3225 child_pairs := get_children(token);
3226 n := COALESCE(array_length(child_pairs, 1), 0);
3227
3228 IF aggregation_function = 'sum' THEN
3229 -- Trivial empty aggregation: SUM = 0, so SUM^k = 0 for k >= 1.
3230 -- Note: AGG_TOKEN semantics treat the "no row included" world as
3231 -- SUM = 0, so this stays consistent with k = 1 (= expected()).
3232 IF n = 0 THEN
3233 RETURN 0;
3234 END IF;
3235
3236 -- Extract per-child token + value arrays.
3237 vals := ARRAY[]::float8[];
3238 toks := ARRAY[]::UUID[];
3239 FOR i IN 1..n LOOP
3240 pair_children := get_children(child_pairs[i]);
3241 toks := toks || pair_children[1];
3242 vals := vals || CAST(get_extra(pair_children[2]) AS float8);
3243 END LOOP;
3244
3245 -- Enumerate all k-tuples (i_1, ..., i_k) in {1..n}^k. tup is the
3246 -- current tuple; we step through them in lexicographic order.
3247 total := 0;
3248 tup := array_fill(1, ARRAY[k]);
3249 LOOP
3250 prod_v := 1;
3251 FOR j IN 1..k LOOP
3252 prod_v := prod_v * vals[tup[j]];
3253 END LOOP;
3254
3255 SELECT array_agg(DISTINCT toks[idx]) INTO distinct_tok
3256 FROM unnest(tup) AS idx;
3257
3258 IF prov <> gate_one() THEN
3259 distinct_tok := distinct_tok || prov;
3260 END IF;
3261 conj_token := provenance_times(VARIADIC distinct_tok);
3262 prob := probability_evaluate(conj_token, method, arguments);
3263
3264 total := total + prod_v * prob;
3265
3266 d := k;
3267 WHILE d >= 1 AND tup[d] = n LOOP
3268 tup[d] := 1;
3269 d := d - 1;
3270 END LOOP;
3271 EXIT WHEN d = 0;
3272 tup[d] := tup[d] + 1;
3273 END LOOP;
3274 ELSIF aggregation_function = 'min' OR aggregation_function = 'max' THEN
3275 -- Rank enumeration: per distinct value v, P(MIN = v) is the
3276 -- probability that some t_i with v_i=v is true and all t_j with
3277 -- smaller v are false. For MAX we negate values so the same
3278 -- "smaller-than" rank logic computes MIN-of-negated, then flip.
3279 -- The outer multiplier picks up the right sign for the k-th moment
3280 -- of MAX: E[MAX^k] = (-1)^k * E[MIN(-v)^k], so sign_max = (-1)^k.
3281 sign_max := CASE
3282 WHEN aggregation_function = 'max'
3283 THEN power(-1::float8, k)
3284 ELSE 1
3285 END;
3286
3287 -- ±Infinity sink: positive probability of "no row included" makes
3288 -- MIN = +Inf and MAX = -Inf. For MIN^k that's +Inf for any k>=1;
3289 -- for MAX^k it's (-1)^k * (+Inf) = ±Inf depending on k's parity.
3290 WITH tok_value AS (
3291 SELECT (get_children(c))[1] AS tok,
3292 (CASE WHEN aggregation_function='max' THEN -1 ELSE 1 END)
3293 * CAST(get_extra((get_children(c))[2]) AS DOUBLE PRECISION) AS v
3294 FROM UNNEST(child_pairs) AS c
3295 ) SELECT probability_evaluate(provenance_monus(prov, provenance_plus(ARRAY_AGG(tok))))
3296 FROM tok_value
3297 INTO total_probability;
3298
3299 IF total_probability > epsilon() THEN
3300 total := sign_max * 'Infinity'::float8;
3301 ELSE
3302 WITH tok_value AS (
3303 SELECT (get_children(c))[1] AS tok,
3304 (CASE WHEN aggregation_function='max' THEN -1 ELSE 1 END)
3305 * CAST(get_extra((get_children(c))[2]) AS DOUBLE PRECISION) AS v
3306 FROM UNNEST(child_pairs) AS c
3307 ) SELECT sign_max * SUM(p * power(v, k)) FROM (
3308 SELECT t1.v AS v,
3309 probability_evaluate(
3310 provenance_monus(provenance_plus(ARRAY_AGG(t1.tok)),
3311 provenance_plus(ARRAY_AGG(t2.tok))),
3312 method, arguments) AS p
3313 FROM tok_value t1 LEFT OUTER JOIN tok_value t2 ON t1.v > t2.v
3314 GROUP BY t1.v) tmp
3315 INTO total;
3316 END IF;
3317 ELSE
3318 RAISE EXCEPTION USING MESSAGE=
3319 'Cannot compute moment for aggregation function ' || aggregation_function;
3320 END IF;
3321
3322 -- Conditional normalisation: E[X^k · 1_A] / P(A) = E[X^k | A].
3323 IF prov <> gate_one()
3324 AND total <> 0
3325 AND total <> 'Infinity'::float8
3326 AND total <> '-Infinity'::float8 THEN
3327 total := total / probability_evaluate(prov, method, arguments);
3328 END IF;
3329
3330 RETURN total;
3331END
3332$$ LANGUAGE plpgsql PARALLEL SAFE SET search_path=provsql SECURITY DEFINER;
3333
3334/**
3335 * @brief Compute the variance Var[X | prov] of a probabilistic scalar
3336 *
3337 * Polymorphic dispatcher that mirrors @c expected: @c random_variable
3338 * inputs go through the analytical / MC evaluator
3339 * (@c rv_moment(UUID, 2, true)); @c AGG_TOKEN inputs go through the
3340 * @c agg_raw_moment helper, computing
3341 * @f$\mathrm{Var}[X|A] = E[X^2|A] - E[X|A]^2@f$. Conditioning on
3342 * @c prov is supported for @c AGG_TOKEN (matching @c expected) but
3343 * not yet for @c random_variable.
3344 */
3345CREATE OR REPLACE FUNCTION variance(
3346 input ANYELEMENT,
3347 prov UUID = gate_one(),
3348 method TEXT = NULL,
3349 arguments TEXT = NULL)
3350 RETURNS DOUBLE PRECISION AS $$
3351DECLARE
3352 m1 float8;
3353 m2 float8;
3354BEGIN
3355 IF pg_typeof(input) = 'random_variable'::REGTYPE THEN
3356 IF input IS NULL THEN
3357 RETURN NULL;
3358 END IF;
3359 -- Conditioning on prov is handled inside rv_moment: when prov
3360 -- resolves to gate_one() (the default, or load-time
3361 -- simplification of any always-true sub-circuit) the
3362 -- unconditional analytical path runs unchanged; otherwise the
3363 -- joint-circuit loader unifies shared gate_rv leaves between
3364 -- input and prov, and the conditional path runs either
3365 -- truncated-distribution closed form or MC rejection.
3366 RETURN provsql.rv_moment(
3367 (input::random_variable)::UUID, 2, true, prov);
3368 END IF;
3369
3370 IF pg_typeof(input) = 'AGG_TOKEN'::REGTYPE THEN
3371 IF input IS NULL THEN
3372 RETURN NULL;
3373 END IF;
3374 m1 := agg_raw_moment(input::AGG_TOKEN, 1, prov, method, arguments);
3375 m2 := agg_raw_moment(input::AGG_TOKEN, 2, prov, method, arguments);
3376 IF m1 IS NULL OR m2 IS NULL THEN
3377 RETURN NULL;
3378 END IF;
3379 RETURN m2 - m1 * m1;
3380 END IF;
3381
3382 RAISE EXCEPTION 'variance() is not yet supported for input type %', pg_typeof(input);
3383END
3384$$ LANGUAGE plpgsql PARALLEL SAFE SET search_path=provsql SECURITY DEFINER;
3385
3386/**
3387 * @brief Compute the raw moment E[X^k | prov] of a probabilistic scalar
3388 *
3389 * @c k must be a non-negative INTEGER. @c k = 0 returns 1; @c k = 1
3390 * is equivalent to @c expected(input). Polymorphic dispatcher: routes
3391 * @c random_variable through @c rv_moment (analytical / MC) and
3392 * @c AGG_TOKEN through @c agg_raw_moment (SUM via tuple enumeration,
3393 * MIN / MAX via rank enumeration).
3394 */
3395CREATE OR REPLACE FUNCTION moment(
3396 input ANYELEMENT,
3397 k INTEGER,
3398 prov UUID = gate_one(),
3399 method TEXT = NULL,
3400 arguments TEXT = NULL)
3401 RETURNS DOUBLE PRECISION AS $$
3402BEGIN
3403 IF pg_typeof(input) = 'random_variable'::REGTYPE THEN
3404 IF input IS NULL OR k IS NULL THEN
3405 RETURN NULL;
3406 END IF;
3407 -- See variance() above: rv_moment handles the conditional/unconditional
3408 -- dispatch internally based on the resolved prov gate type.
3409 RETURN provsql.rv_moment(
3410 (input::random_variable)::UUID, k, false, prov);
3411 END IF;
3412
3413 IF pg_typeof(input) = 'AGG_TOKEN'::REGTYPE THEN
3414 RETURN agg_raw_moment(input::AGG_TOKEN, k, prov, method, arguments);
3415 END IF;
3416
3417 RAISE EXCEPTION 'moment() is not yet supported for input type %', pg_typeof(input);
3418END
3419$$ LANGUAGE plpgsql PARALLEL SAFE SET search_path=provsql SECURITY DEFINER;
3420
3421/**
3422 * @brief Internal: rv-side support computation
3423 *
3424 * Lifts @c provsql.compute_support out of @c RangeCheck.cpp -- the
3425 * same interval-arithmetic propagation @c runRangeCheck uses to
3426 * decide @c gate_cmps. Returns @c [-Infinity, +Infinity] when the
3427 * tightest bound is the conservative all-real interval (e.g. for a
3428 * normal RV, or any sub-circuit that mixes a normal in).
3429 */
3430CREATE OR REPLACE FUNCTION rv_support(
3431 token UUID, prov UUID DEFAULT gate_one(),
3432 OUT lo float8, OUT hi float8)
3433 AS 'provsql','rv_support' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
3434
3435/**
3436 * @brief Compute the support interval @c [lo, hi] of a probabilistic
3437 * (or deterministic) scalar
3438 *
3439 * Polymorphic dispatcher mirroring @c expected / @c variance /
3440 * @c moment / @c central_moment, with two extra "free" branches:
3441 *
3442 * - <b>Plain NUMERIC</b> (@c smallint / @c INTEGER / @c bigint /
3443 * @c NUMERIC / @c real / @c double @c precision): degenerate
3444 * point support @f$[c, c]@f$. Lets callers ask for the support
3445 * of a literal without round-tripping through @c as_random.
3446 * - <b>@c random_variable / bare @c UUID</b> (any provenance gate
3447 * token, including the implicit @c random_variable @c -> @c UUID
3448 * cast): routes to @c rv_support, which propagates distribution
3449 * supports (uniform exact, exponential @c [0,+∞), normal
3450 * @c (-∞,+∞)) through @c gate_arith via interval arithmetic.
3451 * @c gate_value gives the same @f$[c, c]@f$ point support as the
3452 * NUMERIC branch; any non-scalar gate (Boolean gates, aggregates,
3453 * ...) safely falls back to the conservative all-real interval
3454 * without raising. Conditioning on @c prov is not yet supported.
3455 *
3456 * - @c AGG_TOKEN: closed-form per aggregation function:
3457 * - @c SUM : @f$[\sum_i \min(0,v_i), \sum_i \max(0,v_i)]@f$
3458 * (every row is independently in or out of the included set; the
3459 * extreme SUMs are reached by including only positive or only
3460 * negative-valued rows).
3461 * - @c MIN : @f$[\min_i v_i, \max_i v_i]@f$ in the non-empty
3462 * subsets, plus @c +Infinity if the empty subset has positive
3463 * probability under @c prov.
3464 * - @c MAX : symmetric -- @c -Infinity if empty has positive
3465 * probability under @c prov, otherwise @c min_i v_i; @c hi is
3466 * always @c max_i v_i.
3467 *
3468 * Other aggregation functions raise.
3469 *
3470 * Returns the composite RECORD @c (lo, hi) via the function's
3471 * @c OUT parameters, with @c -Infinity / @c +Infinity marking
3472 * unbounded ends.
3473 */
3474CREATE OR REPLACE FUNCTION support(
3475 input ANYELEMENT,
3476 prov UUID = gate_one(),
3477 method TEXT = NULL,
3478 arguments TEXT = NULL,
3479 OUT lo float8,
3480 OUT hi float8)
3481 AS $$
3482DECLARE
3483 aggregation_function VARCHAR;
3484 child_pairs UUID[];
3485 values_arr float8[];
3486 total_probability float8;
3487BEGIN
3488 IF input IS NULL THEN
3489 lo := NULL; hi := NULL; RETURN;
3490 END IF;
3491
3492 -- Plain NUMERIC: degenerate point support. Lets `support(2.5)` /
3493 -- `support(42)` / etc. return (2.5, 2.5) without making the user
3494 -- wrap in `as_random`.
3495 IF pg_typeof(input) IN (
3496 'smallint'::REGTYPE, 'INTEGER'::REGTYPE, 'bigint'::REGTYPE,
3497 'NUMERIC'::REGTYPE, 'real'::REGTYPE, 'double precision'::REGTYPE) THEN
3498 lo := input::double precision;
3499 hi := input::double precision;
3500 RETURN;
3501 END IF;
3502
3503 -- random_variable has an IMPLICIT cast to UUID, so a single
3504 -- rv_support call covers both shapes. rv_support handles
3505 -- gate_value (point), gate_rv (distribution), gate_arith
3506 -- (propagated), and falls back to the conservative all-real
3507 -- interval for any other gate kind. Conditioning on prov is not
3508 -- supported (would require restricting the underlying joint
3509 -- distribution by the indicator of prov, which has no closed form
3510 -- for the basic distributions we ship).
3511 IF pg_typeof(input) IN ('random_variable'::REGTYPE, 'UUID'::REGTYPE) THEN
3512 -- Conditional support: rv_support folds the AND-conjunct interval
3513 -- constraints from prov into the unconditional support. When
3514 -- prov is gate_one() the unconditional support is returned
3515 -- unchanged.
3516 SELECT r.lo, r.hi INTO lo, hi
3517 FROM provsql.rv_support(input::UUID, prov) r;
3518 RETURN;
3519 END IF;
3520
3521 IF pg_typeof(input) = 'AGG_TOKEN'::REGTYPE THEN
3522 IF get_gate_type(input::AGG_TOKEN) <> 'agg' THEN
3523 RAISE EXCEPTION USING MESSAGE='Wrong gate type for support computation';
3524 END IF;
3525 SELECT pp.proname::varchar FROM pg_proc pp
3526 WHERE oid=(get_infos(input::AGG_TOKEN)).info1
3527 INTO aggregation_function;
3528 child_pairs := get_children(input::AGG_TOKEN);
3529
3530 IF aggregation_function = 'sum' THEN
3531 -- Empty AGG_TOKEN: SUM is identically 0.
3532 IF COALESCE(array_length(child_pairs, 1), 0) = 0 THEN
3533 lo := 0; hi := 0; RETURN;
3534 END IF;
3535 SELECT sum(LEAST(v, 0::float8)), sum(GREATEST(v, 0::float8))
3536 INTO lo, hi
3537 FROM (SELECT CAST(get_extra((get_children(c))[2]) AS float8) AS v
3538 FROM unnest(child_pairs) AS c) sub;
3539 ELSIF aggregation_function = 'min' OR aggregation_function = 'max' THEN
3540 -- Empty AGG_TOKEN: MIN = +Infinity, MAX = -Infinity (the
3541 -- empty-set conventions used by `expected`).
3542 IF COALESCE(array_length(child_pairs, 1), 0) = 0 THEN
3543 IF aggregation_function = 'min' THEN
3544 lo := 'Infinity'::float8; hi := 'Infinity'::float8;
3545 ELSE
3546 lo := '-Infinity'::float8; hi := '-Infinity'::float8;
3547 END IF;
3548 RETURN;
3549 END IF;
3550
3551 WITH tok_value AS (
3552 SELECT (get_children(c))[1] AS tok,
3553 CAST(get_extra((get_children(c))[2]) AS float8) AS v
3554 FROM UNNEST(child_pairs) AS c
3555 )
3556 SELECT min(v), max(v),
3557 probability_evaluate(
3558 provenance_monus(prov, provenance_plus(ARRAY_AGG(tok))),
3559 method, arguments)
3560 INTO lo, hi, total_probability
3561 FROM tok_value;
3562
3563 IF total_probability > epsilon() THEN
3564 IF aggregation_function = 'min' THEN
3565 hi := 'Infinity'::float8;
3566 ELSE
3567 lo := '-Infinity'::float8;
3568 END IF;
3569 END IF;
3570 ELSE
3571 RAISE EXCEPTION USING MESSAGE=
3572 'Cannot compute support for aggregation function ' || aggregation_function;
3573 END IF;
3574 RETURN;
3575 END IF;
3576
3577 RAISE EXCEPTION 'support() is not yet supported for input type %', pg_typeof(input);
3578END
3579$$ LANGUAGE plpgsql PARALLEL SAFE SET search_path=provsql SECURITY DEFINER;
3580
3581/**
3582 * @brief Compute the central moment E[(X - E[X|prov])^k | prov]
3583 *
3584 * @c k = 0 returns 1; @c k = 1 returns 0; @c k = 2 is equivalent to
3585 * @c variance(input, prov, ...). Polymorphic dispatcher: routes
3586 * @c random_variable through @c rv_moment, and @c AGG_TOKEN through
3587 * the binomial expansion
3588 * @f$E[(X-\mu)^k|A] = \sum_{i=0}^{k} \binom{k}{i} (-\mu)^{k-i} E[X^i|A]@f$
3589 * with @f$\mu = E[X|A]@f$, where each @f$E[X^i|A]@f$ comes from
3590 * @c agg_raw_moment.
3591 */
3592CREATE OR REPLACE FUNCTION central_moment(
3593 input ANYELEMENT,
3594 k INTEGER,
3595 prov UUID = gate_one(),
3596 method TEXT = NULL,
3597 arguments TEXT = NULL)
3598 RETURNS DOUBLE PRECISION AS $$
3599DECLARE
3600 mu float8;
3601 total float8;
3602 i INTEGER;
3603 raw_i float8;
3604 binom float8;
3605 -- iterative binomial coefficient C(k, i)
3606 k_double float8;
3607BEGIN
3608 IF pg_typeof(input) = 'random_variable'::REGTYPE THEN
3609 IF input IS NULL OR k IS NULL THEN
3610 RETURN NULL;
3611 END IF;
3612 -- See variance() above: rv_moment handles the conditional/unconditional
3613 -- dispatch internally based on the resolved prov gate type.
3614 RETURN provsql.rv_moment(
3615 (input::random_variable)::UUID, k, true, prov);
3616 END IF;
3617
3618 IF pg_typeof(input) = 'AGG_TOKEN'::REGTYPE THEN
3619 IF input IS NULL OR k IS NULL THEN
3620 RETURN NULL;
3621 END IF;
3622 IF k < 0 THEN
3623 RAISE EXCEPTION 'central_moment(): k must be non-negative (got %)', k;
3624 END IF;
3625 IF k = 0 THEN RETURN 1; END IF;
3626 IF k = 1 THEN RETURN 0; END IF;
3627
3628 mu := agg_raw_moment(input::AGG_TOKEN, 1, prov, method, arguments);
3629 IF mu IS NULL THEN RETURN NULL; END IF;
3630 -- mu may be ±Infinity for empty MIN / MAX with positive empty
3631 -- probability; central_moment is undefined in that case.
3632 IF mu = 'Infinity'::float8 OR mu = '-Infinity'::float8 THEN
3633 RETURN mu;
3634 END IF;
3635
3636 total := 0;
3637 binom := 1; -- C(k, 0)
3638 k_double := k;
3639 FOR i IN 0..k LOOP
3640 raw_i := agg_raw_moment(input::AGG_TOKEN, i, prov, method, arguments);
3641 IF raw_i IS NULL THEN RETURN NULL; END IF;
3642 total := total + binom * power(-mu, k - i) * raw_i;
3643 -- C(k, i+1) = C(k, i) * (k - i) / (i + 1)
3644 IF i < k THEN
3645 binom := binom * (k_double - i) / (i + 1);
3646 END IF;
3647 END LOOP;
3648 RETURN total;
3649 END IF;
3650
3651 RAISE EXCEPTION 'central_moment() is not yet supported for input type %', pg_typeof(input);
3652END
3653$$ LANGUAGE plpgsql PARALLEL SAFE SET search_path=provsql SECURITY DEFINER;
3654
3655/**
3656 * @brief Compute the Shapley value of an input variable
3657 *
3658 * Measures the contribution of a specific input variable to the
3659 * truth of a provenance expression, using game-theoretic Shapley values.
3660 *
3661 * @param token provenance token to evaluate
3662 * @param variable UUID of the input variable
3663 * @param method knowledge compilation method
3664 * @param arguments additional arguments for the method
3665 * @param banzhaf if true, compute the Banzhaf value instead
3666 */
3667CREATE OR REPLACE FUNCTION shapley(
3668 token UUID,
3669 variable UUID,
3670 method TEXT = NULL,
3671 arguments TEXT = NULL,
3672 banzhaf BOOLEAN = 'f')
3673 RETURNS DOUBLE PRECISION AS
3674 'provsql','shapley' LANGUAGE C STABLE;
3675
3676/** @brief Compute Shapley values for all input variables at once */
3677CREATE OR REPLACE FUNCTION shapley_all_vars(
3678 IN token UUID,
3679 IN method TEXT = NULL,
3680 IN arguments TEXT = NULL,
3681 IN banzhaf BOOLEAN = 'f',
3682 OUT variable UUID,
3683 OUT value DOUBLE PRECISION)
3684 RETURNS SETOF RECORD AS
3685 'provsql', 'shapley_all_vars'
3686 LANGUAGE C STABLE;
3687
3688/** @brief Compute the Banzhaf power index of an input variable */
3689CREATE OR REPLACE FUNCTION banzhaf(
3690 token UUID,
3691 variable UUID,
3692 method TEXT = NULL,
3693 arguments TEXT = NULL)
3694 RETURNS DOUBLE PRECISION AS
3695 $$ SELECT provsql.shapley(token, variable, method, arguments, 't') $$
3696 LANGUAGE SQL;
3697
3698/** @brief Compute Banzhaf power indices for all input variables at once */
3699CREATE OR REPLACE FUNCTION banzhaf_all_vars(
3700 IN token UUID,
3701 IN method TEXT = NULL,
3702 IN arguments TEXT = NULL,
3703 OUT variable UUID,
3704 OUT value DOUBLE PRECISION)
3705 RETURNS SETOF RECORD AS
3706 $$ SELECT * FROM provsql.shapley_all_vars(token, method, arguments, 't') $$
3707 LANGUAGE SQL;
3708
3709/** @} */
3710
3711/** @defgroup provenance_output Provenance output
3712 * Functions for visualizing and exporting provenance circuits
3713 * in various formats.
3714 * @{
3715 */
3716
3717/**
3718 * @brief Return a DOT or TEXT visualization of the provenance circuit
3719 *
3720 * @param token root provenance token
3721 * @param token2desc mapping table for gate descriptions
3722 * @param dbg debug level (0 = normal)
3723 */
3724CREATE OR REPLACE FUNCTION view_circuit(
3725 token UUID,
3726 token2desc REGCLASS,
3727 dbg INT = 0)
3728 RETURNS TEXT AS
3729 'provsql','view_circuit' LANGUAGE C;
3730
3731/**
3732 * @brief Return an XML representation of the provenance circuit
3733 *
3734 * @param token root provenance token
3735 * @param token2desc optional mapping table for gate descriptions
3736 */
3737CREATE OR REPLACE FUNCTION to_provxml(
3738 token UUID,
3739 token2desc REGCLASS = NULL)
3740 RETURNS TEXT AS
3741 'provsql','to_provxml' LANGUAGE C;
3742
3743/** @brief Return the provenance token of the current query result tuple */
3744CREATE OR REPLACE FUNCTION provenance() RETURNS UUID AS
3745 'provsql', 'provenance' LANGUAGE C;
3746
3747/**
3748 * @brief Compute where-provenance for a result tuple
3749 *
3750 * Returns a TEXT representation showing which input columns
3751 * contributed to each output column.
3752 */
3753CREATE OR REPLACE FUNCTION where_provenance(token UUID)
3754 RETURNS TEXT AS
3755 'provsql','where_provenance' LANGUAGE C;
3756
3757/** @} */
3758
3759/** @defgroup circuit_init Circuit initialization
3760 * Functions and statements executed at extension load time to
3761 * reset internal caches and create the constant zero/one gates.
3762 * @{
3763 */
3764
3765/** @brief Reset the internal cache of OID constants used by the query rewriter */
3766CREATE OR REPLACE FUNCTION reset_constants_cache()
3767 RETURNS VOID AS
3768 'provsql', 'reset_constants_cache' LANGUAGE C;
3769
3770SELECT reset_constants_cache();
3771
3772SELECT create_gate(gate_zero(), 'zero');
3773SELECT create_gate(gate_one(), 'one');
3774
3775/** @} */
3776
3777/** @brief Types of update operations tracked for temporal provenance */
3778CREATE TYPE QUERY_TYPE_ENUM AS ENUM (
3779 'INSERT', -- Row was inserted
3780 'DELETE', -- Row was deleted
3781 'UPDATE', -- Row was updated
3782 'UNDO' -- Previous operation was undone
3783 );
3784
3785/** @defgroup compiled_semirings Compiled semirings
3786 * Definitions of compiled semirings
3787 * @{
3788 */
3789
3790/** @brief Evaluate provenance as a symbolic formula (e.g., "a ⊗ b ⊕ c") */
3791CREATE FUNCTION sr_formula(token ANYELEMENT, token2value REGCLASS)
3792 RETURNS VARCHAR AS
3793$$
3794BEGIN
3795 RETURN provsql.provenance_evaluate_compiled(
3796 token,
3797 token2value,
3798 'formula',
3799 '𝟙'::VARCHAR
3800 );
3801END
3802$$ LANGUAGE plpgsql STRICT PARALLEL SAFE STABLE;
3803
3804/** @brief Evaluate provenance over the counting semiring (ℕ) */
3805CREATE FUNCTION sr_counting(token ANYELEMENT, token2value REGCLASS)
3806 RETURNS INT AS
3807$$
3808BEGIN
3809 RETURN provsql.provenance_evaluate_compiled(
3810 token,
3811 token2value,
3812 'counting',
3813 1
3814 );
3815END
3816$$ LANGUAGE plpgsql STRICT PARALLEL SAFE STABLE;
3817
3818/** @brief Evaluate provenance as why-provenance (set of witness sets) */
3819CREATE FUNCTION sr_why(token ANYELEMENT, token2value REGCLASS)
3820 RETURNS VARCHAR AS
3821$$
3822BEGIN
3823 RETURN provsql.provenance_evaluate_compiled(
3824 token,
3825 token2value,
3826 'why',
3827 '{}'::VARCHAR
3828 );
3829END
3830$$ LANGUAGE plpgsql STRICT PARALLEL SAFE STABLE;
3831
3832/** @brief Evaluate provenance as how-provenance (canonical polynomial provenance ℕ[X], universal commutative-semiring provenance) */
3833CREATE FUNCTION sr_how(token ANYELEMENT, token2value REGCLASS)
3834 RETURNS VARCHAR AS
3835$$
3836BEGIN
3837 RETURN provsql.provenance_evaluate_compiled(
3838 token,
3839 token2value,
3840 'how',
3841 '{}'::VARCHAR
3842 );
3843END
3844$$ LANGUAGE plpgsql STRICT PARALLEL SAFE STABLE;
3845
3846/** @brief Evaluate provenance as which-provenance (lineage: a single set of contributing labels) */
3847CREATE FUNCTION sr_which(token ANYELEMENT, token2value REGCLASS)
3848 RETURNS VARCHAR AS
3849$$
3850BEGIN
3851 RETURN provsql.provenance_evaluate_compiled(
3852 token,
3853 token2value,
3854 'which',
3855 '{}'::VARCHAR
3856 );
3857END
3858$$ LANGUAGE plpgsql STRICT PARALLEL SAFE STABLE;
3859
3860/** @brief Evaluate provenance as a Boolean expression
3861 *
3862 * The optional @p token2value mapping labels the leaves of the
3863 * formula: when omitted, leaves are rendered as bare @c x@<id@>
3864 * placeholders.
3865 */
3866CREATE FUNCTION sr_boolexpr(token ANYELEMENT, token2value REGCLASS = NULL)
3867 RETURNS VARCHAR AS
3868$$
3869BEGIN
3870 IF token IS NULL THEN
3871 RETURN NULL;
3872 END IF;
3873 RETURN provsql.provenance_evaluate_compiled(
3874 token,
3875 token2value,
3876 'boolexpr',
3877 '⊤'::VARCHAR
3878 );
3879END
3880$$ LANGUAGE plpgsql PARALLEL SAFE STABLE;
3881
3882/** @brief Evaluate provenance over the Boolean semiring (true/false) */
3883CREATE FUNCTION sr_boolean(token ANYELEMENT, token2value REGCLASS)
3884 RETURNS BOOLEAN AS
3885$$
3886BEGIN
3887 RETURN provsql.provenance_evaluate_compiled(
3888 token,
3889 token2value,
3890 'BOOLEAN',
3891 TRUE
3892 );
3893END
3894$$ LANGUAGE plpgsql STRICT PARALLEL SAFE STABLE;
3895
3896/** @brief Evaluate provenance over the tropical (min-plus) m-semiring
3897 *
3898 * Inputs are read as %float8 cost values; the additive identity
3899 * is <tt>'Infinity'::%float8</tt> and the multiplicative identity is 0.
3900 * Returns the cost of the cheapest derivation.
3901 */
3902CREATE FUNCTION sr_tropical(token ANYELEMENT, token2value REGCLASS)
3903 RETURNS FLOAT AS
3904$$
3905BEGIN
3906 RETURN provsql.provenance_evaluate_compiled(
3907 token,
3908 token2value,
3909 'tropical',
3910 0::FLOAT
3911 );
3912END
3913$$ LANGUAGE plpgsql STRICT PARALLEL SAFE STABLE;
3914
3915/** @brief Evaluate provenance over the Viterbi (max-times) m-semiring
3916 *
3917 * Inputs are read as %float8 probability values in @f$[0,1]@f$.
3918 * Returns the probability of the most likely derivation.
3919 */
3920CREATE FUNCTION sr_viterbi(token ANYELEMENT, token2value REGCLASS)
3921 RETURNS FLOAT AS
3922$$
3923BEGIN
3924 RETURN provsql.provenance_evaluate_compiled(
3925 token,
3926 token2value,
3927 'viterbi',
3928 1::FLOAT
3929 );
3930END
3931$$ LANGUAGE plpgsql STRICT PARALLEL SAFE STABLE;
3932
3933/** @brief Evaluate provenance over the Łukasiewicz fuzzy m-semiring
3934 *
3935 * Inputs are read as %float8 graded-truth values in @f$[0,1]@f$.
3936 * Addition is @f$\max@f$; multiplication is the Łukasiewicz t-norm
3937 * @f$\max(a + b - 1, 0)@f$, which preserves crisp truth and avoids
3938 * the near-zero collapse of long product chains.
3939 */
3940CREATE FUNCTION sr_lukasiewicz(token ANYELEMENT, token2value REGCLASS)
3941 RETURNS FLOAT AS
3942$$
3943BEGIN
3944 RETURN provsql.provenance_evaluate_compiled(
3945 token,
3946 token2value,
3947 'lukasiewicz',
3948 1::FLOAT
3949 );
3950END
3951$$ LANGUAGE plpgsql STRICT PARALLEL SAFE STABLE;
3952
3953/** @brief Evaluate provenance over the min-max m-semiring on a user ENUM
3954 *
3955 * Inputs are read as values of a user-defined ENUM carrier; addition
3956 * is ENUM-min, multiplication is ENUM-max. Bottom and top of the ENUM
3957 * are derived from @c pg_enum.enumsortorder. The third argument is a
3958 * sample value of the carrier ENUM, used only for type inference; its
3959 * value is ignored.
3960 *
3961 * The security shape: alternative derivations combine to the least
3962 * sensitive label, joins combine to the most sensitive label.
3963 *
3964 * @param token Provenance token to evaluate.
3965 * @param token2value Mapping from input gates to ENUM values.
3966 * @param element_one Sample value of the carrier ENUM (any value works).
3967 */
3968CREATE FUNCTION sr_minmax(token UUID, token2value REGCLASS, element_one ANYENUM)
3969 RETURNS ANYENUM AS
3970$$
3971BEGIN
3972 RETURN provsql.provenance_evaluate_compiled(
3973 token,
3974 token2value,
3975 'minmax',
3976 element_one
3977 );
3978END
3979$$ LANGUAGE plpgsql STRICT PARALLEL SAFE STABLE;
3980
3981/** @brief Evaluate provenance over the max-min m-semiring on a user ENUM
3982 *
3983 * Dual of :sqlfunc:`sr_minmax`: addition is ENUM-max, multiplication
3984 * is ENUM-min. The fuzzy / availability / trust shape: alternatives
3985 * combine to the most permissive label, joins combine to the strictest
3986 * label. The third argument is a sample value of the carrier ENUM,
3987 * used only for type inference; its value is ignored.
3988 *
3989 * @param token Provenance token to evaluate.
3990 * @param token2value Mapping from input gates to ENUM values.
3991 * @param element_one Sample value of the carrier ENUM (any value works).
3992 */
3993CREATE FUNCTION sr_maxmin(token UUID, token2value REGCLASS, element_one ANYENUM)
3994 RETURNS ANYENUM AS
3995$$
3996BEGIN
3997 RETURN provsql.provenance_evaluate_compiled(
3998 token,
3999 token2value,
4000 'maxmin',
4001 element_one
4002 );
4003END
4004$$ LANGUAGE plpgsql STRICT PARALLEL SAFE STABLE;
4005
4006/** @} */
4007
4008/** @defgroup choose_aggregate choose aggregate
4009 * Choose one value among many, used in particular to code a mutually
4010 * exclusive choice as an aggregate.
4011 * @{
4012 */
4013
4014/** @brief Transition function for the choose aggregate (keeps first non-NULL value) */
4015CREATE FUNCTION choose_function(state ANYELEMENT, data ANYELEMENT)
4016 RETURNS ANYELEMENT AS
4017$$
4018BEGIN
4019 IF state IS NULL THEN
4020 RETURN data;
4021 ELSE
4022 RETURN state;
4023 END IF;
4024END
4025$$ LANGUAGE plpgsql PARALLEL SAFE IMMUTABLE;
4026
4027/** @brief Aggregate that returns an arbitrary non-NULL value from a group */
4028CREATE AGGREGATE choose(ANYELEMENT) (
4029 SFUNC = choose_function,
4030 STYPE = ANYELEMENT
4031);
4032
4033/** @} */
4034
4035GRANT USAGE ON SCHEMA provsql TO PUBLIC;
4036
4037SET search_path TO public;
4038SET search_path TO provsql;
4039
4040/** @defgroup update_provenance Update provenance (PostgreSQL 14+)
4041 * Extended provenance tracking for INSERT, UPDATE, DELETE, and UNDO
4042 * operations, including temporal validity ranges.
4043 * @{
4044 */
4045
4046/**
4047 * @brief Table recording the history of INSERT, UPDATE, DELETE, and UNDO operations
4048 *
4049 * Each row records one provenance-tracked modification, linking the
4050 * operation's provenance token to metadata (query TEXT, type, user,
4051 * TIMESTAMP) and the temporal validity range of the affected rows.
4052 */
4053CREATE TABLE update_provenance (
4054 provsql UUID,
4055 query TEXT,
4056 query_type QUERY_TYPE_ENUM,
4057 username TEXT,
4058 ts TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
4059 valid_time TSTZMULTIRANGE DEFAULT TSTZMULTIRANGE(tstzrange(CURRENT_TIMESTAMP, NULL))
4060);
4061
4062/** @cond INTERNAL */
4063/* Enable provenance tracking on an existing table (PostgreSQL 14+ version).
4064 * Overrides the common version; documented via add_provenance in provsql.common.sql. */
4065CREATE OR REPLACE FUNCTION add_provenance(_tbl REGCLASS)
4066 RETURNS VOID AS
4067$$
4068BEGIN
4069 -- See the common-version body for the rationale of dropping the
4070 -- column DEFAULT and UNIQUE in favour of provenance_guard + a
4071 -- plain index.
4072 EXECUTE format('ALTER TABLE %s ADD COLUMN provsql UUID', _tbl);
4073 EXECUTE format(
4074 'UPDATE %s SET provsql = public.uuid_generate_v4() WHERE provsql IS NULL',
4075 _tbl);
4076 EXECUTE format('CREATE INDEX ON %s(provsql)', _tbl);
4077 EXECUTE format(
4078 'CREATE TRIGGER provenance_guard BEFORE INSERT OR UPDATE OF provsql '
4079 'ON %s FOR EACH ROW EXECUTE PROCEDURE provsql.provenance_guard()',
4080 _tbl);
4081
4082 EXECUTE format('CREATE TRIGGER insert_statement AFTER INSERT ON %s REFERENCING NEW TABLE AS NEW_TABLE FOR EACH STATEMENT EXECUTE PROCEDURE provsql.insert_statement_trigger()', _tbl);
4083 EXECUTE format('CREATE TRIGGER delete_statement AFTER DELETE ON %s REFERENCING OLD TABLE AS OLD_TABLE FOR EACH STATEMENT EXECUTE PROCEDURE provsql.delete_statement_trigger()', _tbl);
4084 EXECUTE format('CREATE TRIGGER update_statement AFTER UPDATE ON %s REFERENCING OLD TABLE AS OLD_TABLE NEW TABLE AS NEW_TABLE FOR EACH STATEMENT EXECUTE PROCEDURE provsql.update_statement_trigger()', _tbl);
4085
4086 PERFORM provsql.set_table_info(_tbl::oid, 'tid');
4087 PERFORM provsql.set_ancestors(_tbl::oid, ARRAY[_tbl::oid]);
4088END
4089$$ LANGUAGE plpgsql SECURITY DEFINER;
4090/** @endcond */
4091
4092/** @cond INTERNAL */
4093/* Trigger function for DELETE statement provenance tracking (PostgreSQL 14+).
4094 * Overrides the common version; documented via delete_statement_trigger in provsql.common.sql. */
4095CREATE OR REPLACE FUNCTION delete_statement_trigger()
4096 RETURNS TRIGGER AS
4097$$
4098DECLARE
4099 query_text TEXT;
4100 delete_token UUID;
4101 old_token UUID;
4102 new_token UUID;
4103 r RECORD;
4104 enable_trigger BOOL;
4105BEGIN
4106 enable_trigger := current_setting('provsql.update_provenance', true);
4107 IF enable_trigger = 'f' THEN
4108 RETURN NULL;
4109 END IF;
4110 delete_token := public.uuid_generate_v4();
4111
4112 PERFORM create_gate(delete_token, 'update');
4113
4114 SELECT query
4115 INTO query_text
4116 FROM pg_stat_activity
4117 WHERE pid = pg_backend_pid();
4118
4119 INSERT INTO update_provenance (provsql, query, query_type, username, ts, valid_time)
4120 VALUES (delete_token, query_text, 'DELETE', current_user, CURRENT_TIMESTAMP, TSTZMULTIRANGE(tstzrange(CURRENT_TIMESTAMP, NULL)));
4121
4122 PERFORM set_config('provsql.update_provenance', 'off', false);
4123 EXECUTE format('INSERT INTO %I.%I SELECT * FROM OLD_TABLE;', TG_TABLE_SCHEMA, TG_TABLE_NAME);
4124 PERFORM set_config('provsql.update_provenance', 'on', false);
4125
4126 FOR r IN (SELECT * FROM OLD_TABLE) LOOP
4127 old_token := r.provsql;
4128 new_token := provenance_monus(old_token, delete_token);
4129
4130 PERFORM set_config('provsql.update_provenance', 'off', false);
4131 EXECUTE format('UPDATE %I.%I SET provsql = $1 WHERE provsql = $2;', TG_TABLE_SCHEMA, TG_TABLE_NAME)
4132 USING new_token, old_token;
4133 PERFORM set_config('provsql.update_provenance', 'on', false);
4134 END LOOP;
4135
4136 RETURN NULL;
4137END
4138$$ LANGUAGE plpgsql SET search_path=provsql,pg_temp SECURITY DEFINER;
4139/** @endcond */
4140
4141/**
4142 * @brief Trigger function for INSERT statement provenance tracking
4143 *
4144 * Records the insertion in update_provenance and multiplies provenance
4145 * tokens of inserted rows with the insert token.
4146 */
4147CREATE OR REPLACE FUNCTION insert_statement_trigger()
4148 RETURNS TRIGGER AS
4149$$
4150DECLARE
4151 query_text TEXT;
4152 insert_token UUID;
4153 old_token UUID;
4154 new_token UUID;
4155 r RECORD;
4156 enable_trigger BOOL;
4157BEGIN
4158 enable_trigger := current_setting('provsql.update_provenance', true);
4159 IF enable_trigger = 'f' THEN
4160 RETURN NULL;
4161 END IF;
4162
4163 insert_token := public.uuid_generate_v4();
4164
4165 PERFORM create_gate(insert_token, 'update');
4166
4167 SELECT query
4168 INTO query_text
4169 FROM pg_stat_activity
4170 WHERE pid = pg_backend_pid();
4171
4172 INSERT INTO update_provenance (provsql, query, query_type, username, ts, valid_time)
4173 VALUES (insert_token, query_text, 'INSERT', current_user, CURRENT_TIMESTAMP, TSTZMULTIRANGE(tstzrange(CURRENT_TIMESTAMP, NULL)));
4174
4175 FOR r IN (SELECT * FROM NEW_TABLE) LOOP
4176 old_token := r.provsql;
4177 new_token := provenance_times(old_token, insert_token);
4178 PERFORM set_config('provsql.update_provenance', 'off', false);
4179 EXECUTE format('UPDATE %I.%I SET provsql = $1 WHERE provsql = $2;', TG_TABLE_SCHEMA, TG_TABLE_NAME)
4180 USING new_token, old_token;
4181 PERFORM set_config('provsql.update_provenance', 'on', false);
4182 END LOOP;
4183
4184 RETURN NULL;
4185END
4186$$ LANGUAGE plpgsql SET search_path=provsql,pg_temp SECURITY DEFINER;
4187
4188/**
4189 * @brief Trigger function for UPDATE statement provenance tracking
4190 *
4191 * Records the update in update_provenance. Multiplies new-row tokens
4192 * with the update token and applies monus to old-row tokens.
4193 */
4194CREATE OR REPLACE FUNCTION update_statement_trigger()
4195 RETURNS TRIGGER AS
4196$$
4197DECLARE
4198 query_text TEXT;
4199 update_token UUID;
4200 old_token UUID;
4201 new_token UUID;
4202 r RECORD;
4203 enable_trigger BOOL;
4204BEGIN
4205 enable_trigger := current_setting('provsql.update_provenance', true);
4206 IF enable_trigger = 'f' THEN
4207 RETURN NULL;
4208 END IF;
4209 update_token := public.uuid_generate_v4();
4210
4211 PERFORM create_gate(update_token, 'update');
4212
4213 SELECT query
4214 INTO query_text
4215 FROM pg_stat_activity
4216 WHERE pid = pg_backend_pid();
4217
4218 INSERT INTO update_provenance (provsql, query, query_type, username, ts, valid_time)
4219 VALUES (update_token, query_text, 'UPDATE', current_user, CURRENT_TIMESTAMP, TSTZMULTIRANGE(tstzrange(CURRENT_TIMESTAMP, NULL)));
4220
4221 FOR r IN (SELECT * FROM NEW_TABLE) LOOP
4222 old_token := r.provsql;
4223 new_token := provenance_times(old_token, update_token);
4224
4225 PERFORM set_config('provsql.update_provenance', 'off', false);
4226 EXECUTE format('UPDATE %I.%I SET provsql = $1 WHERE provsql = $2;', TG_TABLE_SCHEMA, TG_TABLE_NAME)
4227 USING new_token, old_token;
4228 PERFORM set_config('provsql.update_provenance', 'on', false);
4229 END LOOP;
4230
4231 PERFORM set_config('provsql.update_provenance', 'off', false);
4232 EXECUTE format('INSERT INTO %I.%I SELECT * FROM OLD_TABLE;', TG_TABLE_SCHEMA, TG_TABLE_NAME);
4233 PERFORM set_config('provsql.update_provenance', 'on', false);
4234
4235 FOR r IN (SELECT * FROM OLD_TABLE) LOOP
4236 old_token := r.provsql;
4237 new_token := provenance_monus(old_token, update_token);
4238
4239 PERFORM set_config('provsql.update_provenance', 'off', false);
4240 EXECUTE format('UPDATE %I.%I SET provsql = $1 WHERE provsql = $2;', TG_TABLE_SCHEMA, TG_TABLE_NAME)
4241 USING new_token, old_token;
4242 PERFORM set_config('provsql.update_provenance', 'on', false);
4243 END LOOP;
4244
4245 RETURN NULL;
4246END
4247$$ LANGUAGE plpgsql SET search_path=provsql,pg_temp SECURITY DEFINER;
4248
4249
4250/** @} */
4251
4252/** @defgroup temporal_db Temporal DB (PostgreSQL 14+)
4253 * Functions for temporal database support. These use provenance
4254 * evaluation over the multirange semiring to track temporal validity
4255 * of tuples.
4256 * @{
4257 */
4258
4259SET search_path TO provsql;
4260
4261/**
4262 * @brief Evaluate provenance over the temporal (interval-union) m-semiring
4263 *
4264 * Inputs are read as %TSTZMULTIRANGE validity intervals; the additive
4265 * identity is <tt>'{}'::%TSTZMULTIRANGE</tt> (empty), the multiplicative
4266 * identity is <tt>'{(,)}'::%TSTZMULTIRANGE</tt> (universal). Returns the union
4267 * of intervals supporting the result, computed via the compiled circuit
4268 * traversal.
4269 *
4270 * @param token Provenance token to evaluate.
4271 * @param token2value Mapping from input gates to validity multiranges.
4272 */
4273CREATE FUNCTION sr_temporal(token ANYELEMENT, token2value REGCLASS)
4274 RETURNS TSTZMULTIRANGE AS
4275$$
4276BEGIN
4277 RETURN provsql.provenance_evaluate_compiled(
4278 token,
4279 token2value,
4280 'interval_union',
4281 '{(,)}'::TSTZMULTIRANGE
4282 );
4283END
4284$$ LANGUAGE plpgsql STRICT PARALLEL SAFE STABLE;
4285
4286/**
4287 * @brief Evaluate provenance over the interval-union m-semiring
4288 * with a NUMERIC multirange carrier
4289 *
4290 * Inputs are read as %nummultirange validity ranges over a NUMERIC
4291 * domain (e.g. sensor measurement-validity ranges). Addition is
4292 * multirange union, multiplication is intersection, monus is set
4293 * difference; the additive identity is <tt>'{}'::%nummultirange</tt>
4294 * and the multiplicative identity is <tt>'{(,)}'::%nummultirange</tt>
4295 * (universal range).
4296 *
4297 * @param token Provenance token to evaluate.
4298 * @param token2value Mapping from input gates to NUMERIC multiranges.
4299 */
4300CREATE FUNCTION sr_interval_num(token ANYELEMENT, token2value REGCLASS)
4301 RETURNS nummultirange AS
4302$$
4303BEGIN
4304 RETURN provsql.provenance_evaluate_compiled(
4305 token,
4306 token2value,
4307 'interval_union',
4308 '{(,)}'::nummultirange
4309 );
4310END
4311$$ LANGUAGE plpgsql STRICT PARALLEL SAFE STABLE;
4312
4313/**
4314 * @brief Evaluate provenance over the interval-union m-semiring
4315 * with an int4 multirange carrier
4316 *
4317 * Inputs are read as %int4multirange validity ranges over the
4318 * integers (e.g. page or line ranges of supporting documents).
4319 * Addition is multirange union, multiplication is intersection,
4320 * monus is set difference; the additive identity is
4321 * <tt>'{}'::%int4multirange</tt> and the multiplicative identity is
4322 * <tt>'{(,)}'::%int4multirange</tt>.
4323 *
4324 * @param token Provenance token to evaluate.
4325 * @param token2value Mapping from input gates to int4 multiranges.
4326 */
4327CREATE FUNCTION sr_interval_int(token ANYELEMENT, token2value REGCLASS)
4328 RETURNS int4multirange AS
4329$$
4330BEGIN
4331 RETURN provsql.provenance_evaluate_compiled(
4332 token,
4333 token2value,
4334 'interval_union',
4335 '{(,)}'::int4multirange
4336 );
4337END
4338$$ LANGUAGE plpgsql STRICT PARALLEL SAFE STABLE;
4339
4340/**
4341 * @brief Evaluate temporal provenance as a TIMESTAMP multirange
4342 *
4343 * Thin wrapper around :sqlfunc:`sr_temporal` retained for backward
4344 * compatibility; both compute the same union of validity intervals.
4345 *
4346 * @param token provenance token to evaluate
4347 * @param token2value mapping table from tokens to temporal validity ranges
4348 */
4349CREATE OR REPLACE FUNCTION union_tstzintervals(
4350 token UUID,
4351 token2value REGCLASS
4352)
4353RETURNS TSTZMULTIRANGE AS
4354$$
4355 SELECT sr_temporal(token, token2value)
4356$$ LANGUAGE SQL PARALLEL SAFE STABLE;
4357
4358/**
4359 * @brief Query a table as it was at a specific point in time
4360 *
4361 * Returns all rows whose temporal validity includes the given TIMESTAMP.
4362 *
4363 * @param tablename name of the provenance-tracked table
4364 * @param at_time the point in time to query
4365 */
4366CREATE OR REPLACE FUNCTION timetravel(
4367 tablename TEXT,
4368 at_time TIMESTAMPTZ
4369)
4370RETURNS SETOF RECORD
4371LANGUAGE plpgsql
4372AS
4373$$
4374BEGIN
4375 RETURN QUERY EXECUTE format(
4376 '
4377 SELECT
4378 %1$I.*,
4379 sr_temporal(provenance(), %2$L)
4380 FROM
4381 %1$I
4382 WHERE
4383 sr_temporal(provenance(), %2$L) @> %3$L::TIMESTAMPTZ
4384 ',
4385 tablename,
4386 'provsql.time_validity_view',
4387 at_time::TEXT
4388 );
4389END;
4390$$;
4391
4392/**
4393 * @brief Query a table for rows valid during a time interval
4394 *
4395 * Returns all rows whose temporal validity overlaps the given range.
4396 *
4397 * @param tablename name of the provenance-tracked table
4398 * @param from_time start of the time interval
4399 * @param to_time end of the time interval
4400 */
4401CREATE OR REPLACE FUNCTION timeslice(
4402 tablename TEXT,
4403 from_time TIMESTAMPTZ,
4404 to_time TIMESTAMPTZ
4405)
4406RETURNS SETOF RECORD
4407LANGUAGE plpgsql
4408AS
4409$$
4410BEGIN
4411 RETURN QUERY EXECUTE format(
4412 '
4413 SELECT
4414 %1$I.*,
4415 sr_temporal(provenance(), %2$L)
4416 FROM
4417 %1$I
4418 WHERE
4419 sr_temporal(provenance(), %2$L)
4420 && tstzrange(%3$L::TIMESTAMPTZ, %4$L::TIMESTAMPTZ)
4421 ',
4422 tablename,
4423 'provsql.time_validity_view',
4424 from_time::TEXT,
4425 to_time::TEXT
4426 );
4427END;
4428$$;
4429
4430/**
4431 * @brief Query the full temporal history of specific rows
4432 *
4433 * Returns all versions of rows matching the given column values,
4434 * with their temporal validity ranges.
4435 *
4436 * @param tablename name of the provenance-tracked table
4437 * @param col_names array of column names to filter on
4438 * @param col_values array of corresponding values to match
4439 */
4440CREATE OR REPLACE FUNCTION history(
4441 tablename TEXT,
4442 col_names TEXT[],
4443 col_values TEXT[]
4444)
4445RETURNS SETOF RECORD
4446LANGUAGE plpgsql
4447AS
4448$$
4449DECLARE
4450 condition TEXT := '';
4451 i INT;
4452BEGIN
4453 IF array_length(col_names, 1) IS NULL
4454 OR array_length(col_values, 1) IS NULL
4455 OR array_length(col_names, 1) != array_length(col_values, 1)
4456 THEN
4457 RAISE EXCEPTION 'col_names and col_values must have the same (non-null) length';
4458 END IF;
4459
4460 FOR i IN 1..array_length(col_names, 1)
4461 LOOP
4462 IF i > 1 THEN
4463 condition := condition || ' AND ';
4464 END IF;
4465 condition := condition || format('%I = %L', col_names[i], col_values[i]);
4466 END LOOP;
4467
4468 RETURN QUERY EXECUTE format(
4469 '
4470 SELECT
4471 %I.*,
4472 sr_temporal(provenance(), %L)
4473 FROM
4474 %I
4475 WHERE
4476 %s
4477 ',
4478 tablename,
4479 'provsql.time_validity_view',
4480 tablename,
4481 condition
4482 );
4483END;
4484$$;
4485
4486/**
4487 * @brief Get the valid time range for a specific tuple
4488 *
4489 * @param token provenance token of the tuple
4490 * @param tablename name of the table containing the tuple
4491 */
4492CREATE OR REPLACE FUNCTION get_valid_time(
4493 token UUID,
4494 tablename TEXT
4495)
4496RETURNS TSTZMULTIRANGE
4497LANGUAGE plpgsql
4498AS $$
4499DECLARE
4500 result TSTZMULTIRANGE;
4501BEGIN
4502 EXECUTE format(
4503 '
4504 SELECT
4505 sr_temporal(provenance(), %L)
4506 FROM
4507 %I
4508 WHERE
4509 provsql = %L
4510 ',
4511 'provsql.time_validity_view',
4512 tablename,
4513 token
4514 )
4515 INTO result;
4516
4517 RETURN result;
4518END;
4519$$;
4520
4521/**
4522 * @brief Undo a previously recorded update operation
4523 *
4524 * Traverses all provenance-tracked tables and rewrites their circuits
4525 * to apply monus with respect to the given update token, effectively
4526 * undoing the operation.
4527 *
4528 * @param c UUID of the update operation to undo (from update_provenance)
4529 */
4530CREATE OR REPLACE FUNCTION undo(
4531 c UUID
4532)
4533RETURNS UUID
4534LANGUAGE plpgsql
4535AS $$
4536DECLARE
4537 undo_query TEXT;
4538 undone_query TEXT;
4539 undo_token UUID;
4540 schema_rec RECORD;
4541 table_rec RECORD;
4542 row_rec RECORD;
4543 new_x UUID;
4544BEGIN
4545 SELECT query INTO undone_query
4546 FROM update_provenance
4547 WHERE provsql = c
4548 LIMIT 1;
4549
4550 IF undone_query IS NULL THEN
4551 RAISE NOTICE 'Unable to find % in update_provenance', c;
4552 RETURN c;
4553 END IF;
4554
4555 SELECT query
4556 INTO undo_query
4557 FROM pg_stat_activity
4558 WHERE pid = pg_backend_pid();
4559
4560 undo_token := public.uuid_generate_v4();
4561 PERFORM create_gate(undo_token, 'update');
4562 INSERT INTO update_provenance(provsql, query, query_type, username, ts, valid_time)
4563 VALUES (
4564 undo_token,
4565 undo_query,
4566 'UNDO',
4567 current_user,
4568 CURRENT_TIMESTAMP,
4569 TSTZMULTIRANGE(tstzrange(CURRENT_TIMESTAMP, NULL))
4570 );
4571
4572 PERFORM set_config('provsql.update_provenance', 'off', false);
4573
4574 FOR schema_rec IN
4575 SELECT nspname
4576 FROM pg_namespace
4577 WHERE nspname NOT IN ('pg_catalog','information_schema','pg_toast','pg_temp_1','pg_toast_temp_1')
4578 LOOP
4579 FOR table_rec IN
4580 EXECUTE format('SELECT tablename AS tname FROM pg_tables WHERE schemaname = %L', schema_rec.nspname)
4581 LOOP
4582 IF EXISTS (
4583 SELECT 1
4584 FROM information_schema.columns
4585 WHERE table_schema = schema_rec.nspname
4586 AND table_name = table_rec.tname
4587 AND table_name <> 'update_provenance'
4588 AND column_name = 'provsql'
4589 ) THEN
4590 FOR row_rec IN
4591 EXECUTE format('SELECT provsql AS x FROM %I.%I', schema_rec.nspname, table_rec.tname)
4592 LOOP
4593 new_x := replace_the_circuit(row_rec.x, c, undo_token);
4594 EXECUTE format('UPDATE %I.%I SET provsql = $1 WHERE provsql = $2',
4595 schema_rec.nspname, table_rec.tname)
4596 USING new_x, row_rec.x;
4597 END LOOP;
4598 END IF;
4599 END LOOP;
4600 END LOOP;
4601
4602 PERFORM set_config('provsql.update_provenance', 'on', false);
4603
4604 RETURN undo_token;
4605END;
4606$$;
4607
4608/**
4609 * @brief Recursively rewrite a circuit to undo a specific operation
4610 *
4611 * Helper for undo(). Walks the circuit and replaces occurrences of
4612 * the target update gate with its monus.
4613 *
4614 * @param x provenance token to rewrite
4615 * @param c UUID of the update operation to undo
4616 * @param u UUID of the undo operation
4617 */
4618CREATE OR REPLACE FUNCTION replace_the_circuit(
4619 x UUID,
4620 c UUID,
4621 u UUID
4622)
4623RETURNS UUID
4624LANGUAGE plpgsql
4625AS $$
4626DECLARE
4627 nchildren UUID[];
4628 child UUID;
4629 ntoken UUID;
4630 ntype PROVENANCE_GATE;
4631BEGIN
4632 IF x = c THEN
4633 RETURN provenance_monus(c, u);
4634 -- update and input gates cannot have children
4635 ELSIF get_gate_type(x) = 'update' OR get_gate_type(x) = 'input' THEN
4636 RETURN x;
4637 ELSE
4638 nchildren := '{}';
4639 FOREACH child IN ARRAY get_children(x)
4640 LOOP
4641 nchildren := array_append(nchildren, replace_the_circuit(child, c, u));
4642 END LOOP;
4643
4644 ntoken := public.uuid_generate_v4();
4645 ntype := get_gate_type(x);
4646
4647 PERFORM create_gate(ntoken, ntype, nchildren);
4648 RETURN ntoken;
4649 END IF;
4650END;
4651$$;
4652
4653SELECT create_provenance_mapping_view('time_validity_view', 'update_provenance', 'valid_time');
4654
4655/** @} */
4656
4657SET search_path TO public;