ProvSQL C/C++ API
Adding support for provenance and uncertainty management to PostgreSQL databases
Loading...
Searching...
No Matches
provsql.c
Go to the documentation of this file.
1/**
2 * @file provsql.c
3 * @brief PostgreSQL planner hook for transparent provenance tracking.
4 *
5 * This file installs a @c planner_hook that intercepts every SELECT query
6 * and rewrites it to propagate a provenance circuit token (UUID) alongside
7 * normal result tuples. The rewriting proceeds in three conceptual phases:
8 *
9 * -# **Discovery** – scan the range table for relations/subqueries that
10 * already carry a @c provsql UUID column (@c get_provenance_attributes).
11 * -# **Expression building** – combine the discovered tokens according
12 * to the semiring operation that corresponds to the SQL operator in use
13 * (⊗ for joins, ⊕ for duplicate elimination, ⊖ for EXCEPT) and wrap
14 * aggregations (@c make_provenance_expression,
15 * @c make_aggregation_expression).
16 * -# **Splice** – append the resulting provenance expression to the target
17 * list and replace any explicit @c provenance() call in the query with
18 * the computed expression (@c add_to_select,
19 * @c replace_provenance_function_by_expression).
20 */
21#include "postgres.h"
22#include "fmgr.h"
23#include "miscadmin.h"
24#include "pg_config.h"
25#include "access/htup_details.h"
26#include "access/sysattr.h"
27#include "catalog/pg_aggregate.h"
28#include "catalog/pg_class.h" /* RELKIND_VIEW */
29#include "catalog/pg_collation.h"
30#include "catalog/pg_operator.h"
31#include "catalog/pg_proc.h"
32#include "catalog/pg_type.h"
33#include "nodes/makefuncs.h"
34#include "nodes/nodeFuncs.h"
35#include "nodes/print.h"
36#include "executor/executor.h"
37#include "optimizer/planner.h"
38#include "parser/parse_coerce.h"
39#include "parser/parse_oper.h"
40#include "utils/builtins.h"
41#include "parser/parsetree.h"
42#include "storage/lwlock.h"
43#include "storage/shmem.h"
44#include "utils/fmgroids.h"
45#include "utils/guc.h"
46#include "utils/lsyscache.h"
47#include "utils/ruleutils.h"
48#include "utils/syscache.h"
49#include "catalog/namespace.h"
50#include "catalog/pg_cast.h"
51#include "commands/createas.h"
52#include "executor/spi.h"
53#include "tcop/utility.h"
54#include <time.h>
55
56#include "classify_query.h"
57#include "provsql_mmap.h"
58#include "provsql_shmem.h"
59#include "provsql_utils.h"
60#include "safe_query.h"
61
62#if PG_VERSION_NUM < 100000
63#error "ProvSQL requires PostgreSQL version 10 or later"
64#endif
65
66#include "compatibility.h"
67
68PG_MODULE_MAGIC; ///< Required PostgreSQL extension magic block
69
70/* -------------------------------------------------------------------------
71 * Global state & forward declarations
72 * ------------------------------------------------------------------------- */
73
75static bool provsql_active = true; ///< @c true while ProvSQL query rewriting is enabled
77static bool provsql_update_provenance = false; ///< @c true when provenance tracking for DML is enabled
78int provsql_verbose = 100; ///< Verbosity level; controlled by the @c provsql.verbose_level GUC
79bool provsql_aggtoken_text_as_uuid = false; ///< When @c true, @c agg_token::text emits the underlying provenance UUID instead of @c "value (*)"
80char *provsql_tool_search_path = NULL; ///< Colon-separated directory list prepended to @c PATH when invoking external tools (d4, c2d, minic2d, dsharp, weightmc, graph-easy); controlled by the @c provsql.tool_search_path GUC
81int provsql_monte_carlo_seed = -1; ///< Seed for the Monte Carlo sampler; -1 means non-deterministic (std::random_device); controlled by the @c provsql.monte_carlo_seed GUC
82int provsql_rv_mc_samples = 10000; ///< Default sample count for analytical-evaluator MC fallbacks; 0 disables fallback (callers raise instead); controlled by the @c provsql.rv_mc_samples GUC
83bool provsql_simplify_on_load = true; ///< Run universal cmp-resolution passes when @c getGenericCircuit returns; controlled by the @c provsql.simplify_on_load GUC
84bool provsql_hybrid_evaluation = true; ///< Run the hybrid-evaluator simplifier inside @c probability_evaluate; controlled by the @c provsql.hybrid_evaluation GUC
85bool provsql_cmp_probability_evaluation = true; ///< Run closed-form / analytic probability evaluators for @c gate_cmps inside @c probability_evaluate (currently the Poisson-binomial pre-pass for HAVING-COUNT; future MIN / MAX / SUM evaluators will gate on the same GUC); controlled by the @c provsql.cmp_probability_evaluation GUC
86bool provsql_boolean_provenance = false; ///< Opt-in safe-query optimisation: when @c true, rewrites hierarchical conjunctive queries to a read-once form whose probability is computable in linear time. The resulting circuit is tagged so that semiring evaluations admitting no homomorphism from Boolean functions refuse to run on it. Controlled by the @c provsql.boolean_provenance GUC.
87
88
89extern void _PG_init(void);
90extern void _PG_fini(void);
91
92static planner_hook_type prev_planner = NULL; ///< Previous planner hook (chained)
93
94static Query *process_query(const constants_t *constants, Query *q,
95 bool **removed, bool wrap_root);
96static Expr *wrap_in_assume_boolean(const constants_t *constants, Expr *expr);
97
98/* -------------------------------------------------------------------------
99 * Provenance attribute construction
100 * ------------------------------------------------------------------------- */
101
102/**
103 * @brief Build a Var node that references the provenance column of a relation.
104 *
105 * Creates a @c Var pointing to attribute @p attid of range-table entry
106 * @p relid, typed as UUID, and marks the column as selected in the
107 * permission bitmap so PostgreSQL grants access correctly.
108 *
109 * @param constants Extension OID cache.
110 * @param q Owning query (needed to update permission info on PG 16+).
111 * @param r Range-table entry that owns the provenance column.
112 * @param relid 1-based index of @p r in @p q->rtable.
113 * @param attid 1-based attribute number of the provenance column in @p r.
114 * @return A freshly allocated @c Var node.
115 */
116static Var *make_provenance_attribute(const constants_t *constants, Query *q,
117 RangeTblEntry *r, Index relid,
118 AttrNumber attid) {
119 Var *v = makeNode(Var);
120
121 v->varno = relid;
122 v->varattno = attid;
123
124#if PG_VERSION_NUM >= 130000
125 v->varnosyn = relid;
126 v->varattnosyn = attid;
127#else
128 v->varnoold = relid;
129 v->varoattno = attid;
130#endif
131
132 v->vartype = constants->OID_TYPE_UUID;
133 v->varcollid = InvalidOid;
134 v->vartypmod = -1;
135 v->location = -1;
136
137#if PG_VERSION_NUM >= 160000
138 if (r->perminfoindex != 0) {
139 RTEPermissionInfo *rpi =
140 list_nth_node(RTEPermissionInfo, q->rteperminfos, r->perminfoindex - 1);
141 rpi->selectedCols = bms_add_member(
142 rpi->selectedCols, attid - FirstLowInvalidHeapAttributeNumber);
143 }
144#else
145 r->selectedCols = bms_add_member(r->selectedCols,
146 attid - FirstLowInvalidHeapAttributeNumber);
147#endif
148
149 return v;
150}
151
152/* -------------------------------------------------------------------------
153 * Helper mutators: attribute-number fixup and type patching
154 * ------------------------------------------------------------------------- */
155
156/** @brief Context for the @c reduce_varattno_mutator tree walker. */
158 Index varno; ///< Range-table entry whose attribute numbers are being adjusted
159 int *offset; ///< Per-attribute cumulative shift to apply
161
162/**
163 * @brief Tree-mutator callback that adjusts Var attribute numbers.
164 * @param node Current expression tree node.
165 * @param ctx Pointer to a @c reduce_varattno_mutator_context.
166 * @return Possibly modified node.
167 */
168static Node *reduce_varattno_mutator(Node *node, void *ctx) {
170 if (node == NULL)
171 return NULL;
172
173 if (IsA(node, Var)) {
174 Var *v = (Var *)node;
175
176 if (v->varno == context->varno) {
177 v->varattno += context->offset[v->varattno - 1];
178 }
179 }
180
181 return expression_tree_mutator(node, reduce_varattno_mutator, ctx);
182}
183
184/**
185 * @brief Adjust Var attribute numbers in @p targetList after columns are removed.
186 *
187 * When provenance columns are stripped from a subquery's target list, the
188 * remaining columns shift left. This function applies a pre-computed
189 * @p offset array (one entry per original column) to correct all @c Var
190 * nodes that reference range-table entry @p varno.
191 *
192 * @param targetList Target list of the outer query to patch.
193 * @param varno Range-table entry whose attribute numbers need fixing.
194 * @param offset Cumulative shift per original attribute (negative or zero).
195 */
196static void reduce_varattno_by_offset(List *targetList, Index varno,
197 int *offset) {
198 ListCell *lc;
199 reduce_varattno_mutator_context context = {varno, offset};
200
201 foreach (lc, targetList) {
202 Node *te = lfirst(lc);
203 expression_tree_mutator(te, reduce_varattno_mutator, &context);
204 }
205}
206
207/** @brief Context for the @c aggregation_type_mutator tree walker. */
209 Index varno; ///< Range-table entry index of the aggregate var
210 Index varattno; ///< Attribute number of the aggregate column
211 const constants_t *constants; ///< Extension OID cache
213
214/**
215 * @brief Check if a Var matches the target aggregate column.
216 */
217static bool is_target_agg_var(Node *node,
219 if (IsA(node, Var)) {
220 Var *v = (Var *)node;
221 return v->varno == context->varno && v->varattno == context->varattno;
222 }
223 return false;
224}
225
226/**
227 * @brief Tree-mutator that retypes a specific Var to @c agg_token.
228 *
229 * When the target Var is inside a cast FuncExpr, replaces the cast
230 * function with the equivalent agg_token→target cast from pg_cast.
231 * When the Var appears bare (e.g. in a TargetEntry for display), it is
232 * retyped to agg_token directly. In all other contexts (arithmetic,
233 * window functions, etc.), wraps the Var in an explicit agg_token→original
234 * cast so that parent nodes receive the expected type.
235 *
236 * @param node Current expression tree node.
237 * @param ctx Pointer to an @c aggregation_type_mutator_context (varno,
238 * varattno, and constants).
239 * @return Possibly modified node.
240 */
241static Node *
242aggregation_type_mutator(Node *node, void *ctx) {
244 if (node == NULL)
245 return NULL;
246
247 if (IsA(node, FuncExpr)) {
248 FuncExpr *f = (FuncExpr *)node;
249
250 /* Check if this is a cast wrapping our target Var */
251 if (list_length(f->args) == 1 &&
252 is_target_agg_var(linitial(f->args), context)) {
253 /* Look up the cast from agg_token to the target type */
254 HeapTuple castTuple = SearchSysCache2(CASTSOURCETARGET,
255 ObjectIdGetDatum(context->constants->OID_TYPE_AGG_TOKEN),
256 ObjectIdGetDatum(f->funcresulttype));
257
258 if (HeapTupleIsValid(castTuple)) {
259 Form_pg_cast castForm = (Form_pg_cast) GETSTRUCT(castTuple);
260 if (OidIsValid(castForm->castfunc)) {
261 f->funcid = castForm->castfunc;
262 }
263 ReleaseSysCache(castTuple);
264 }
265
266 /* Retype the Var inside */
267 ((Var *)linitial(f->args))->vartype =
269
270 return (Node *)f;
271 }
272 }
273
274 if (IsA(node, Var)) {
275 Var *v = (Var *)node;
276
277 if (v->varno == context->varno && v->varattno == context->varattno) {
278 v->vartype = context->constants->OID_TYPE_AGG_TOKEN;
279 }
280 }
281 return expression_tree_mutator(node, aggregation_type_mutator, ctx);
282}
283
284/**
285 * @brief Retypes aggregation-result Vars in @p q from UUID to @c agg_token.
286 *
287 * After a subquery that contains @c provenance_aggregate is processed, its
288 * result type is @c agg_token rather than plain UUID. This mutator walks
289 * the outer query and updates the type of every @c Var referencing that
290 * result column so that subsequent type-checking passes correctly.
291 *
292 * @param constants Extension OID cache.
293 * @param q Outer query to patch.
294 * @param rteid Range-table index of the subquery in @p q.
295 * @param targetList Target list of the subquery (to locate provenance_aggregate columns).
296 */
297static void fix_type_of_aggregation_result(const constants_t *constants,
298 Query *q, Index rteid,
299 List *targetList) {
300 ListCell *lc;
301 aggregation_type_mutator_context context = {0, 0, constants};
302 Index attno = 1;
303
304 foreach (lc, targetList) {
305 TargetEntry *te = (TargetEntry *)lfirst(lc);
306 if (IsA(te->expr, FuncExpr)) {
307 FuncExpr *f = (FuncExpr *)te->expr;
308
309 if (f->funcid == constants->OID_FUNCTION_PROVENANCE_AGGREGATE) {
310 context.varno = rteid;
311 context.varattno = attno;
312 query_tree_mutator(q, aggregation_type_mutator, &context,
313 QTW_DONT_COPY_QUERY | QTW_IGNORE_RC_SUBQUERIES);
314
315 /* Check if the retyped column is used in ORDER BY or GROUP BY */
316 {
317 ListCell *lc2;
318 foreach (lc2, q->targetList) {
319 TargetEntry *outer_te = (TargetEntry *)lfirst(lc2);
320 if (IsA(outer_te->expr, Var)) {
321 Var *v = (Var *)outer_te->expr;
322 if (v->varno == rteid && v->varattno == attno &&
323 outer_te->ressortgroupref > 0) {
324 ListCell *lc3;
325 foreach (lc3, q->sortClause) {
326 SortGroupClause *sgc = (SortGroupClause *)lfirst(lc3);
327 if (sgc->tleSortGroupRef == outer_te->ressortgroupref)
328 provsql_error("ORDER BY on aggregate results from "
329 "a subquery not supported");
330 }
331 foreach (lc3, q->groupClause) {
332 SortGroupClause *sgc = (SortGroupClause *)lfirst(lc3);
333 if (sgc->tleSortGroupRef == outer_te->ressortgroupref)
334 provsql_error("GROUP BY on aggregate results from "
335 "a subquery not supported");
336 }
337 }
338 }
339 }
340 }
341 }
342 }
343 ++attno;
344 }
345}
346
347/**
348 * @brief Inline CTE references as subqueries within a query.
349 *
350 * Replaces each non-recursive RTE_CTE entry in @p rtable with an
351 * RTE_SUBQUERY containing a copy of the CTE's query, looking up
352 * definitions in @p cteList. Recurses into newly inlined subqueries
353 * to handle nested CTE references (ctelevelsup > 0).
354 *
355 * @param rtable Range table to scan for RTE_CTE entries.
356 * @param cteList CTE definitions to look up names in.
357 */
358static void inline_ctes_in_rtable(List *rtable, List *cteList) {
359 ListCell *lc;
360 foreach (lc, rtable) {
361 RangeTblEntry *r = (RangeTblEntry *)lfirst(lc);
362 if (r->rtekind == RTE_CTE) {
363 ListCell *lc2;
364 foreach (lc2, cteList) {
365 CommonTableExpr *cte = (CommonTableExpr *)lfirst(lc2);
366 if (strcmp(cte->ctename, r->ctename) == 0) {
367 if (cte->cterecursive) {
368 provsql_error("Recursive CTEs not supported");
369 } else {
370 r->rtekind = RTE_SUBQUERY;
371 r->subquery = copyObject((Query *)cte->ctequery);
372 r->ctename = NULL;
373 r->ctelevelsup = 0;
374 /* Recurse: the inlined subquery may reference other CTEs
375 * from the same cteList */
376 inline_ctes_in_rtable(r->subquery->rtable, cteList);
377 }
378 break;
379 }
380 }
381 } else if (r->rtekind == RTE_SUBQUERY && r->subquery != NULL) {
382 /* Recurse into existing subqueries (e.g., UNION branches) to
383 * inline CTE references they may contain */
384 inline_ctes_in_rtable(r->subquery->rtable, cteList);
385 }
386 }
387}
388
389/**
390 * @brief Inline all CTE references in @p q as subqueries.
391 */
392static void inline_ctes(Query *q) {
393 if (q->cteList == NIL)
394 return;
395 inline_ctes_in_rtable(q->rtable, q->cteList);
396 q->cteList = NIL;
397}
398
399/**
400 * @brief Collect all provenance Var nodes reachable from @p q's range table.
401 *
402 * Walks every RTE in @p q->rtable:
403 * - @c RTE_RELATION: looks for a column named @c provsql of type UUID.
404 * - @c RTE_SUBQUERY: recursively calls @c process_query and splices the
405 * resulting provenance column back into the parent's column list, also
406 * patching outer Var attribute numbers if inner columns were removed.
407 * - @c RTE_CTE: non-recursive CTEs are inlined as @c RTE_SUBQUERY before
408 * the main loop, then processed as above. Recursive CTEs raise an error.
409 * - @c RTE_FUNCTION: handled when the function returns a single UUID column
410 * named @c provsql.
411 * - @c RTE_JOIN / @c RTE_VALUES / @c RTE_GROUP: handled passively (the
412 * underlying base-table RTEs supply the tokens).
413 *
414 * @param constants Extension OID cache.
415 * @param q Query whose range table is scanned (subquery RTEs are
416 * modified in place by the recursive call).
417 * @return List of @c Var nodes, one per provenance source; @c NIL if the
418 * query has no provenance-bearing relation.
419 */
420static List *get_provenance_attributes(const constants_t *constants, Query *q) {
421 List *prov_atts = NIL;
422
423 for(Index rteid = 1; rteid <= q->rtable->length; ++rteid) {
424 RangeTblEntry *r = list_nth_node(RangeTblEntry, q->rtable, rteid-1);
425
426 if (r->rtekind == RTE_RELATION) {
427 ListCell *lc;
428 AttrNumber attid = 1;
429
430 /* PG 14 and 15 leave the OLD/NEW rule-placeholder RTEs (relkind
431 * = RELKIND_VIEW, inFromCl = false) in the rewritten range table
432 * for any view body. PG 16+ removes them. They are never
433 * scanned and the planner does not build a RelOptInfo for them,
434 * so any Var we point at them later fails find_base_rel().
435 * Filter them out here; any post-rewrite RTE_RELATION whose
436 * relkind is still a view is one of these artifacts. */
437 if (r->relkind == RELKIND_VIEW)
438 continue;
439
440 foreach (lc, r->eref->colnames) {
441 const char *v = strVal(lfirst(lc));
442
443 if (!strcmp(v, PROVSQL_COLUMN_NAME) &&
444 get_atttype(r->relid, attid) == constants->OID_TYPE_UUID) {
445 prov_atts =
446 lappend(prov_atts,
447 make_provenance_attribute(constants, q, r, rteid, attid));
448 }
449
450 ++attid;
451 }
452 } else if (r->rtekind == RTE_SUBQUERY) {
453 bool *inner_removed = NULL;
454 int old_targetlist_length =
455 r->subquery->targetList ? r->subquery->targetList->length : 0;
456 Query *new_subquery =
457 process_query(constants, r->subquery, &inner_removed, false);
458 if (new_subquery != NULL) {
459 int i = 0;
460 int *offset = (int *)palloc(old_targetlist_length * sizeof(int));
461 unsigned varattnoprovsql;
462 ListCell *cell, *prev;
463
464 r->subquery = new_subquery;
465
466 if (inner_removed != NULL) {
467 for (cell = list_head(r->eref->colnames), prev = NULL;
468 cell != NULL;) {
469 if (inner_removed[i]) {
470 r->eref->colnames =
471 my_list_delete_cell(r->eref->colnames, cell, prev);
472 if (prev)
473 cell = my_lnext(r->eref->colnames, prev);
474 else
475 cell = list_head(r->eref->colnames);
476 } else {
477 prev = cell;
478 cell = my_lnext(r->eref->colnames, cell);
479 }
480 ++i;
481 }
482 for (i = 0; i < old_targetlist_length; ++i) {
483 offset[i] =
484 (i == 0 ? 0 : offset[i - 1]) - (inner_removed[i] ? 1 : 0);
485 }
486
487 reduce_varattno_by_offset(q->targetList, rteid, offset);
488 }
489
490 varattnoprovsql = 0;
491 for (cell = list_head(new_subquery->targetList); cell != NULL;
492 cell = my_lnext(new_subquery->targetList, cell)) {
493 TargetEntry *te = (TargetEntry *)lfirst(cell);
494 ++varattnoprovsql;
495 if (te->resname && !strcmp(te->resname, PROVSQL_COLUMN_NAME))
496 break;
497 }
498
499 if (cell != NULL) {
500 r->eref->colnames = list_insert_nth(r->eref->colnames, varattnoprovsql-1,
501 makeString(pstrdup(PROVSQL_COLUMN_NAME)));
502 prov_atts =
503 lappend(prov_atts, make_provenance_attribute(
504 constants, q, r, rteid, varattnoprovsql));
505 }
506 fix_type_of_aggregation_result(constants, q, rteid,
507 r->subquery->targetList);
508 }
509 } else if (r->rtekind == RTE_JOIN) {
510 if (r->jointype == JOIN_INNER || r->jointype == JOIN_LEFT ||
511 r->jointype == JOIN_FULL || r->jointype == JOIN_RIGHT) {
512 // Nothing to do, there will also be RTE entries for the tables
513 // that are part of the join, from which we will extract the
514 // provenance information
515 } else { // Semijoin (should be feasible, but check whether the second
516 // provenance information is available) Antijoin (feasible with
517 // negation)
518 provsql_error("JOIN type not supported");
519 }
520 } else if (r->rtekind == RTE_FUNCTION) {
521 ListCell *lc;
522 AttrNumber attid = 1;
523
524 foreach (lc, r->functions) {
525 RangeTblFunction *func = (RangeTblFunction *)lfirst(lc);
526
527 if (func->funccolcount == 1) {
528 FuncExpr *expr = (FuncExpr *)func->funcexpr;
529 if (expr->funcresulttype == constants->OID_TYPE_UUID &&
530 !strcmp(get_rte_attribute_name(r, attid), PROVSQL_COLUMN_NAME)) {
531 prov_atts = lappend(prov_atts, make_provenance_attribute(
532 constants, q, r, rteid, attid));
533 }
534 } else {
535 provsql_error("FROM function with multiple output "
536 "attributes not supported");
537 }
538
539 attid += func->funccolcount;
540 }
541 } else if (r->rtekind == RTE_VALUES) {
542 // Nothing to do, no provenance attribute in literal values
543#if PG_VERSION_NUM >= 180000
544 } else if (r->rtekind == RTE_GROUP) {
545 // Introduced in PostgreSQL 18, we already handle group by from
546 // groupClause
547#endif
548 } else {
549 provsql_error("FROM clause not supported");
550 }
551 }
552
553 return prov_atts;
554}
555
556/* -------------------------------------------------------------------------
557 * Target-list surgery
558 * ------------------------------------------------------------------------- */
559
560/**
561 * @brief Strip provenance UUID columns from @p q's SELECT list.
562 *
563 * Scans the target list and removes every @c Var entry whose column name is
564 * @c provsql and whose type is UUID. The remaining entries have their
565 * @c resno values decremented to fill the gaps.
566 *
567 * @param constants Extension OID cache.
568 * @param q Query to modify in place.
569 * @param removed Out-param: allocated boolean array (length =
570 * original target list length) where @c true means the
571 * corresponding entry was removed. The caller must
572 * @c pfree this array when done.
573 * @return Bitmapset of @c ressortgroupref values whose entries were
574 * removed (so the caller can clean up GROUP BY / ORDER BY).
575 */
576static Bitmapset *
578 bool **removed) {
579 int nbRemoved = 0;
580 int i = 0;
581 Bitmapset *ressortgrouprefs = NULL;
582 ListCell *cell, *prev;
583 *removed = (bool *)palloc(q->targetList->length * sizeof(bool));
584
585 for (cell = list_head(q->targetList), prev = NULL; cell != NULL;) {
586 TargetEntry *rt = (TargetEntry *)lfirst(cell);
587 (*removed)[i] = false;
588
589 if (rt->expr->type == T_Var) {
590 Var *v = (Var *)rt->expr;
591
592 if (v->vartype == constants->OID_TYPE_UUID) {
593 const char *colname;
594
595 if (rt->resname)
596 colname = rt->resname;
597 else {
598 /* This case occurs, for example, when grouping by a column
599 * that is projected out */
600 RangeTblEntry *r = (RangeTblEntry *)list_nth(q->rtable, v->varno - 1);
601 colname = strVal(list_nth(r->eref->colnames, v->varattno - 1));
602 }
603
604 if (!strcmp(colname, PROVSQL_COLUMN_NAME)) {
605 q->targetList = my_list_delete_cell(q->targetList, cell, prev);
606
607 (*removed)[i] = true;
608 ++nbRemoved;
609
610 if (rt->ressortgroupref > 0)
611 ressortgrouprefs =
612 bms_add_member(ressortgrouprefs, rt->ressortgroupref);
613 }
614 }
615 }
616
617 if ((*removed)[i]) {
618 if (prev) {
619 cell = my_lnext(q->targetList, prev);
620 } else {
621 cell = list_head(q->targetList);
622 }
623 } else {
624 rt->resno -= nbRemoved;
625 prev = cell;
626 cell = my_lnext(q->targetList, cell);
627 }
628
629 ++i;
630 }
631
632 return ressortgrouprefs;
633}
634
635/**
636 * @brief Semiring operation used to combine provenance tokens.
637 *
638 * @c SR_TIMES corresponds to the multiplicative operation (joins, Cartesian
639 * products), @c SR_PLUS to the additive operation (duplicate elimination), and
640 * @c SR_MONUS to the monus / set-difference operation (EXCEPT).
641 *
642 * @see https://provsql.org/lean-docs/Provenance/QueryRewriting.html
643 * Lean 4 formalization of rewriting rules (R1)--(R5) and correctness
644 * theorem @c Query.rewriting_valid.
645 */
646typedef enum {
647 SR_PLUS, ///< Semiring addition (UNION, SELECT DISTINCT)
648 SR_MONUS, ///< Semiring monus / set difference (EXCEPT)
649 SR_TIMES ///< Semiring multiplication (JOIN, Cartesian product)
651
652/* -------------------------------------------------------------------------
653 * Semiring expression builders
654 * ------------------------------------------------------------------------- */
655
656/**
657 * @brief Wrap @p toExpr in a @c provenance_eq gate if @p fromOpExpr is an
658 * equality between two tracked columns.
659 *
660 * Used for where-provenance: each equijoin condition (and some WHERE
661 * equalities) introduces an @c eq gate that records which attribute positions
662 * were compared. Because this function is also called for WHERE predicates,
663 * it applies extra guards and silently returns @p toExpr unchanged when the
664 * expression does not match the expected shape (both sides must be @c Var
665 * nodes, possibly wrapped in a @c RelabelType).
666 *
667 * @param constants Extension OID cache.
668 * @param fromOpExpr The equality @c OpExpr to inspect.
669 * @param toExpr Existing provenance expression to wrap.
670 * @param columns Per-RTE column-numbering array. EQ gate positions
671 * carry the same sequential-number caveat as PROJECT
672 * gate positions (see @c build_column_map()); they are
673 * only correct when each operand's RTE is either a join
674 * RTE or a subquery, not a bare provenance-tracked base
675 * table.
676 * @return @p toExpr wrapped in @c provenance_eq(toExpr, col1, col2), or
677 * @p toExpr unchanged if the shape is unsupported.
678 */
679static Expr *add_eq_from_OpExpr_to_Expr(const constants_t *constants,
680 OpExpr *fromOpExpr, Expr *toExpr,
681 int **columns) {
682 Datum first_arg;
683 Datum second_arg;
684 FuncExpr *fc;
685 Const *c1;
686 Const *c2;
687 Var *v1;
688 Var *v2;
689
690 if (my_lnext(fromOpExpr->args, list_head(fromOpExpr->args))) {
691 /* Sometimes Var is nested within a RelabelType */
692 if (IsA(linitial(fromOpExpr->args), Var)) {
693 v1 = linitial(fromOpExpr->args);
694 } else if (IsA(linitial(fromOpExpr->args), RelabelType)) {
695 /* In the WHERE case it can be a Const */
696 RelabelType *rt1 = linitial(fromOpExpr->args);
697 if (IsA(rt1->arg, Var)) { /* Can be Param in the WHERE case */
698 v1 = (Var *)rt1->arg;
699 } else
700 return toExpr;
701 } else
702 return toExpr;
703 if (!columns[v1->varno - 1])
704 return toExpr;
705 first_arg = Int16GetDatum(columns[v1->varno - 1][v1->varattno - 1]);
706
707 if (IsA(lsecond(fromOpExpr->args), Var)) {
708 v2 = lsecond(fromOpExpr->args);
709 } else if (IsA(lsecond(fromOpExpr->args), RelabelType)) {
710 /* In the WHERE case it can be a Const */
711 RelabelType *rt2 = lsecond(fromOpExpr->args);
712 if (IsA(rt2->arg, Var)) { /* Can be Param in the WHERE case */
713 v2 = (Var *)rt2->arg;
714 } else
715 return toExpr;
716 } else
717 return toExpr;
718 if (!columns[v2->varno - 1])
719 return toExpr;
720 second_arg = Int16GetDatum(columns[v2->varno - 1][v2->varattno - 1]);
721
722 fc = makeNode(FuncExpr);
723 fc->funcid = constants->OID_FUNCTION_PROVENANCE_EQ;
724 fc->funcvariadic = false;
725 fc->funcresulttype = constants->OID_TYPE_UUID;
726 fc->location = -1;
727
728 c1 = makeConst(constants->OID_TYPE_INT, -1, InvalidOid, sizeof(int16),
729 first_arg, false, true);
730
731 c2 = makeConst(constants->OID_TYPE_INT, -1, InvalidOid, sizeof(int16),
732 second_arg, false, true);
733
734 fc->args = list_make3(toExpr, c1, c2);
735 return (Expr *)fc;
736 }
737 return toExpr;
738}
739
740/**
741 * @brief Walk a join-condition or WHERE quals node and add @c eq gates for
742 * every equality it contains.
743 *
744 * Dispatches to @c add_eq_from_OpExpr_to_Expr for simple @c OpExpr nodes
745 * and iterates over the arguments of an AND @c BoolExpr. OR/NOT inside a
746 * join ON clause are rejected with an error.
747 *
748 * @param constants Extension OID cache.
749 * @param quals Root of the quals tree (@c OpExpr or @c BoolExpr), or
750 * @c NULL (in which case @p result is returned unchanged).
751 * @param result Provenance expression to wrap.
752 * @param columns Per-RTE column-numbering array.
753 * @return Updated provenance expression with zero or more @c eq gates added.
754 */
755static Expr *add_eq_from_Quals_to_Expr(const constants_t *constants,
756 Node *quals, Expr *result,
757 int **columns) {
758 OpExpr *oe;
759
760 if (!quals)
761 return result;
762
763 if (IsA(quals, OpExpr)) {
764 oe = (OpExpr *)quals;
765 result = add_eq_from_OpExpr_to_Expr(constants, oe, result, columns);
766 } /* Sometimes OpExpr is nested within a BoolExpr */
767 else if (IsA(quals, BoolExpr)) {
768 BoolExpr *be = (BoolExpr *)quals;
769 /* In some cases, there can be an OR or a NOT specified with ON clause */
770 if (be->boolop == OR_EXPR || be->boolop == NOT_EXPR) {
771 provsql_error("Boolean operators OR and NOT in a join...on "
772 "clause are not supported");
773 } else {
774 ListCell *lc2;
775 foreach (lc2, be->args) {
776 if (IsA(lfirst(lc2), OpExpr)) {
777 oe = (OpExpr *)lfirst(lc2);
778 result = add_eq_from_OpExpr_to_Expr(constants, oe, result, columns);
779 }
780 }
781 }
782 } else { /* Handle other cases */
783 }
784 return result;
785}
786
787/**
788 * @brief Build the per-row provenance token for an aggregate rewrite.
789 *
790 * Used by both @c make_aggregation_expression (for the agg_token /
791 * @c provenance_semimod path) and @c make_rv_aggregate_expression (for
792 * the inline RV-aggregate path). Combines @p prov_atts via
793 * @c provenance_times (under @c SR_TIMES) or @c provenance_monus
794 * (under @c SR_MONUS); a single @c prov_att is returned as-is.
795 *
796 * @return An @c Expr returning UUID; never @c NULL.
797 */
798static Expr *combine_prov_atts(const constants_t *constants,
799 List *prov_atts, semiring_operation op) {
800 FuncExpr *combine;
801
802 if (my_lnext(prov_atts, list_head(prov_atts)) == NULL)
803 return (Expr *)linitial(prov_atts);
804
805 combine = makeNode(FuncExpr);
806 if (op == SR_TIMES) {
807 ArrayExpr *array = makeNode(ArrayExpr);
808
809 combine->funcid = constants->OID_FUNCTION_PROVENANCE_TIMES;
810 combine->funcvariadic = true;
811
812 array->array_typeid = constants->OID_TYPE_UUID_ARRAY;
813 array->element_typeid = constants->OID_TYPE_UUID;
814 array->elements = prov_atts;
815 array->location = -1;
816
817 combine->args = list_make1(array);
818 } else { // SR_MONUS
819 combine->funcid = constants->OID_FUNCTION_PROVENANCE_MONUS;
820 combine->args = prov_atts;
821 }
822 combine->funcresulttype = constants->OID_TYPE_UUID;
823 combine->location = -1;
824 return (Expr *)combine;
825}
826
827/**
828 * @brief Inline rewrite of an RV-returning aggregate into the same
829 * aggregate over provenance-wrapped per-row arguments.
830 *
831 * Originally Phase 1 of the SUM-over-RV story (see @c aggregation-of-rvs.md);
832 * extended to any aggregate whose result type is @c random_variable
833 * (e.g. @c provsql.sum, @c provsql.avg). Replaces @c agg(@c x) with an
834 * Aggref whose per-row argument is lifted through @c rv_aggregate_semimod
835 * to attach the row's provenance: each row contributes
836 * @c mixture(prov_token, X_i, as_random(0)). The aggregate itself
837 * (@c aggfnoid) is preserved verbatim, so its SFUNC / FFUNC decide what
838 * gate shape to build from the per-row mixtures. In particular:
839 * - @c sum(random_variable) collects the mixtures into a single
840 * @c gate_arith @c PLUS root, realising
841 * @f$\mathrm{SUM}(x) = \sum_i \mathbf{1}\{\varphi_i\} \cdot X_i@f$;
842 * - @c avg(random_variable) walks each mixture to recover @c prov_i
843 * and emits @c gate_arith(DIV, @c sum(mixture(p_i,x_i,0)),
844 * @c sum(mixture(p_i,1,0))), the natural lift of "AVG = SUM / COUNT"
845 * into the @c random_variable algebra.
846 *
847 * Routing happens at @c make_aggregation_expression on
848 * @c agg_ref->aggtype @c == @c OID_TYPE_RANDOM_VARIABLE, so any future
849 * RV-returning aggregate inherits the same per-row provenance wrap
850 * without further C-side dispatch.
851 *
852 * SR_PLUS (UNION outer level) is handled by the caller; this builder
853 * never runs for SR_PLUS.
854 */
855static Expr *make_rv_aggregate_expression(const constants_t *constants,
856 Aggref *agg_ref, List *prov_atts,
858 Expr *prov_expr = combine_prov_atts(constants, prov_atts, op);
859 Expr *rv_arg = ((TargetEntry *)linitial(agg_ref->args))->expr;
860 FuncExpr *wrap;
861 Aggref *new_agg;
862 TargetEntry *te;
863
864 /* Wrap the per-row RV in mixture(prov, rv, as_random(0)) via the
865 * rv_aggregate_semimod SQL helper. Going through the helper avoids
866 * having to look up the specific (uuid, random_variable,
867 * random_variable) overload of mixture and the (double precision)
868 * overload of as_random at OID-cache time. */
869 wrap = makeNode(FuncExpr);
870 wrap->funcid = constants->OID_FUNCTION_RV_AGGREGATE_SEMIMOD;
871 wrap->funcresulttype = constants->OID_TYPE_RANDOM_VARIABLE;
872 wrap->args = list_make2(prov_expr, rv_arg);
873 wrap->location = -1;
874
875 /* Rebuild an Aggref calling the SAME aggregate (sum_rv, avg_rv, ...),
876 * but with the argument now wrapped. Inherit the original Aggref's
877 * clause positioning; aggargtypes / aggtype stay random_variable. */
878 te = makeNode(TargetEntry);
879 te->resno = 1;
880 te->expr = (Expr *)wrap;
881
882 new_agg = makeNode(Aggref);
883 new_agg->aggfnoid = agg_ref->aggfnoid;
884 new_agg->aggtype = constants->OID_TYPE_RANDOM_VARIABLE;
885 new_agg->aggargtypes = list_make1_oid(constants->OID_TYPE_RANDOM_VARIABLE);
886 new_agg->aggkind = AGGKIND_NORMAL;
887 new_agg->aggtranstype = InvalidOid;
888 new_agg->args = list_make1(te);
889 new_agg->location = agg_ref->location;
890#if PG_VERSION_NUM >= 140000
891 new_agg->aggno = new_agg->aggtransno = -1;
892#endif
893
894 return (Expr *)new_agg;
895}
896
897/**
898 * @brief Build the provenance expression for a single aggregate function.
899 *
900 * For @c SR_PLUS (union context) returns the first provenance attribute
901 * directly. For @c SR_TIMES or @c SR_MONUS, constructs:
902 * @code
903 * provenance_aggregate(fn_oid, result_type,
904 * original_aggref,
905 * array_agg(provenance_semimod(arg, times_or_monus_token)))
906 * @endcode
907 * COUNT(*) and COUNT(expr) are remapped to SUM so that the semimodule
908 * semantics (scalar × token → token) work correctly.
909 *
910 * @param constants Extension OID cache.
911 * @param agg_ref The original @c Aggref node from the query.
912 * @param prov_atts List of provenance @c Var nodes.
913 * @param op Semiring operation (determines how tokens are combined).
914 * @return Provenance expression of type @c agg_token.
915 */
916static Expr *make_aggregation_expression(const constants_t *constants,
917 Aggref *agg_ref, List *prov_atts,
919 Expr *result;
920 FuncExpr *expr, *expr_s;
921 Aggref *agg = makeNode(Aggref);
922 FuncExpr *plus = makeNode(FuncExpr);
923 TargetEntry *te_inner = makeNode(TargetEntry);
924 Const *fn = makeNode(Const);
925 Const *typ = makeNode(Const);
926
927 if (op == SR_PLUS) {
928 result = linitial(prov_atts);
929 } else {
930 Oid aggregation_function = agg_ref->aggfnoid;
931
932 /* Aggregates that return random_variable (sum_rv, avg_rv, and any
933 * future RV-returning aggregate) get a different rewrite: instead
934 * of going through provenance_semimod (which builds a gate_value
935 * from CAST(val AS VARCHAR), nonsensical for an RV), each per-row
936 * argument is wrapped in mixture(prov, rv, as_random(0)) and the
937 * original aggregate's SFUNC / FFUNC decide what gate shape to
938 * build from the resulting mixtures. See aggregation-of-rvs.md. */
939 if (OidIsValid(constants->OID_TYPE_RANDOM_VARIABLE) &&
940 agg_ref->aggtype == constants->OID_TYPE_RANDOM_VARIABLE) {
941 return make_rv_aggregate_expression(constants, agg_ref, prov_atts, op);
942 }
943
944 if (my_lnext(prov_atts, list_head(prov_atts)) == NULL)
945 expr = linitial(prov_atts);
946 else {
947 expr = makeNode(FuncExpr);
948 if (op == SR_TIMES) {
949 ArrayExpr *array = makeNode(ArrayExpr);
950
951 expr->funcid = constants->OID_FUNCTION_PROVENANCE_TIMES;
952 expr->funcvariadic = true;
953
954 array->array_typeid = constants->OID_TYPE_UUID_ARRAY;
955 array->element_typeid = constants->OID_TYPE_UUID;
956 array->elements = prov_atts;
957 array->location = -1;
958
959 expr->args = list_make1(array);
960 } else { // SR_MONUS
961 expr->funcid = constants->OID_FUNCTION_PROVENANCE_MONUS;
962 expr->args = prov_atts;
963 }
964 expr->funcresulttype = constants->OID_TYPE_UUID;
965 expr->location = -1;
966 }
967
968 // semimodule function
969 expr_s = makeNode(FuncExpr);
970 expr_s->funcid = constants->OID_FUNCTION_PROVENANCE_SEMIMOD;
971 expr_s->funcresulttype = constants->OID_TYPE_UUID;
972
973 // check the particular case of count
974 if (aggregation_function == F_COUNT_ ||
975 aggregation_function == F_COUNT_ANY) // count(*) or count(arg)
976 {
977 Const *one = makeConst(constants->OID_TYPE_INT, -1, InvalidOid,
978 sizeof(int32), Int32GetDatum(1), false, true);
979 expr_s->args = list_make2(one, expr);
980 aggregation_function = F_SUM_INT4;
981 } else {
982 expr_s->args =
983 list_make2(((TargetEntry *)linitial(agg_ref->args))->expr, expr);
984 }
985
986 expr_s->location = -1;
987
988 // aggregating all semirings in an array
989 te_inner->resno = 1;
990 te_inner->expr = (Expr *)expr_s;
991 agg->aggfnoid = constants->OID_FUNCTION_ARRAY_AGG;
992 agg->aggtype = constants->OID_TYPE_UUID_ARRAY;
993 agg->args = list_make1(te_inner);
994 agg->aggkind = AGGKIND_NORMAL;
995 agg->location = -1;
996#if PG_VERSION_NUM >= 140000
997 agg->aggno = agg->aggtransno = -1;
998#endif
999
1000 agg->aggargtypes = list_make1_oid(constants->OID_TYPE_UUID);
1001
1002 // final aggregation function
1003 plus->funcid = constants->OID_FUNCTION_PROVENANCE_AGGREGATE;
1004
1005 fn = makeConst(constants->OID_TYPE_INT, -1, InvalidOid, sizeof(int32),
1006 Int32GetDatum(aggregation_function), false, true);
1007
1008 typ = makeConst(constants->OID_TYPE_INT, -1, InvalidOid, sizeof(int32),
1009 Int32GetDatum(agg_ref->aggtype), false, true);
1010
1011 plus->funcresulttype = constants->OID_TYPE_AGG_TOKEN;
1012 plus->args = list_make4(fn, typ, agg_ref, agg);
1013 plus->location = -1;
1014
1015 result = (Expr *)plus;
1016 }
1017
1018 return result;
1019}
1020
1021/* -------------------------------------------------------------------------
1022 * HAVING / WHERE-on-aggregates rewriting
1023 * ------------------------------------------------------------------------- */
1024
1025/* Forward declaration needed because having_BoolExpr_to_provenance and
1026 * having_Expr_to_provenance_cmp are mutually recursive. */
1027static FuncExpr *having_Expr_to_provenance_cmp(Expr *expr, const constants_t *constants, bool negated);
1028
1029/* Forward declaration: defined alongside the other tree walkers
1030 * further down in the file. */
1031static bool needs_having_lift(Node *havingQual, const constants_t *constants);
1032
1033/**
1034 * @brief Convert a comparison @c OpExpr on aggregate results into a
1035 * @c provenance_cmp gate expression.
1036 *
1037 * Each argument of @p opExpr must be one of:
1038 * - A @c Var of type @c agg_token (or a @c FuncExpr implicit-cast wrapper
1039 * around one) → cast to UUID via @c agg_token_to_uuid.
1040 * - A scalar @c Const → wrapped in @c provenance_semimod(const, gate_one()).
1041 *
1042 * If @p negated is true the operator OID is replaced by its negator so that
1043 * NOT(a < b) becomes a >= b at the provenance level.
1044 *
1045 * @param opExpr The comparison expression from the HAVING clause.
1046 * @param constants Extension OID cache.
1047 * @param negated Whether the expression appears under a NOT.
1048 * @return A @c provenance_cmp(lhs, op_oid, rhs) @c FuncExpr.
1049 */
1050static FuncExpr *having_OpExpr_to_provenance_cmp(OpExpr *opExpr, const constants_t *constants, bool negated) {
1051 FuncExpr *cmpExpr;
1052 Node *arguments[2];
1053 Const *oid;
1054 Oid opno = opExpr->opno;
1055
1056 for (unsigned i = 0; i < 2; ++i) {
1057 Node *node = (Node *)lfirst(list_nth_cell(opExpr->args, i));
1058
1059 if (IsA(node, FuncExpr)) {
1060 FuncExpr *fe = (FuncExpr *)node;
1061 if (fe->funcformat == COERCE_IMPLICIT_CAST ||
1062 fe->funcformat == COERCE_EXPLICIT_CAST) {
1063 if (fe->args->length == 1)
1064 node = lfirst(list_head(fe->args));
1065 }
1066 }
1067
1068 if (IsA(node, FuncExpr)) {
1069 FuncExpr *fe = (FuncExpr *)node;
1070 if (fe->funcid == constants->OID_FUNCTION_PROVENANCE_AGGREGATE) {
1071 // We need to add an explicit cast to UUID
1072 FuncExpr *castToUUID = makeNode(FuncExpr);
1073
1074 castToUUID->funcid = constants->OID_FUNCTION_AGG_TOKEN_UUID;
1075 castToUUID->funcresulttype = constants->OID_TYPE_UUID;
1076 castToUUID->args = list_make1(fe);
1077 castToUUID->location = -1;
1078
1079 arguments[i] = (Node *)castToUUID;
1080 } else {
1081 provsql_error("cannot handle complex HAVING expressions");
1082 }
1083 } else if (IsA(node, Var)) {
1084 Var *v = (Var *)node;
1085
1086 if (v->vartype == constants->OID_TYPE_AGG_TOKEN) {
1087 // We need to add an explicit cast to UUID
1088 FuncExpr *castToUUID = makeNode(FuncExpr);
1089
1090 castToUUID->funcid = constants->OID_FUNCTION_AGG_TOKEN_UUID;
1091 castToUUID->funcresulttype = constants->OID_TYPE_UUID;
1092 castToUUID->args = list_make1(v);
1093 castToUUID->location = -1;
1094
1095 arguments[i] = (Node *)castToUUID;
1096 } else {
1097 provsql_error("cannot handle complex HAVING expressions");
1098 }
1099 } else if (IsA(node, Const)) {
1100 Const *literal = (Const *)node;
1101 FuncExpr *oneExpr, *semimodExpr;
1102
1103 // gate_one() expression
1104 oneExpr = makeNode(FuncExpr);
1105 oneExpr->funcid = constants->OID_FUNCTION_GATE_ONE;
1106 oneExpr->funcresulttype = constants->OID_TYPE_UUID;
1107 oneExpr->args = NIL;
1108 oneExpr->location = -1;
1109
1110 // provenance_semimod(literal, gate_one())
1111 semimodExpr = makeNode(FuncExpr);
1112 semimodExpr->funcid = constants->OID_FUNCTION_PROVENANCE_SEMIMOD;
1113 semimodExpr->funcresulttype = constants->OID_TYPE_UUID;
1114 semimodExpr->args = list_make2((Expr *)literal, (Expr *)oneExpr);
1115 semimodExpr->location = -1;
1116
1117 arguments[i] = (Node *)semimodExpr;
1118 } else {
1119 provsql_error("cannot handle complex HAVING expressions");
1120 }
1121 }
1122
1123 if (negated) {
1124 opno = get_negator(opno);
1125 if (!opno)
1126 provsql_error("Missing negator");
1127 }
1128
1129 oid = makeConst(constants->OID_TYPE_INT, -1, InvalidOid, sizeof(int32),
1130 Int32GetDatum(opno), false, true);
1131
1132 cmpExpr = makeNode(FuncExpr);
1133 cmpExpr->funcid = constants->OID_FUNCTION_PROVENANCE_CMP;
1134 cmpExpr->funcresulttype = constants->OID_TYPE_UUID;
1135 cmpExpr->args = list_make3(arguments[0], oid, arguments[1]);
1136 cmpExpr->location = opExpr->location;
1137
1138 return cmpExpr;
1139}
1140
1141/**
1142 * @brief Convert a Boolean combination of HAVING comparisons into a
1143 * @c provenance_times / @c provenance_plus gate expression.
1144 *
1145 * Applies De Morgan duality when @p negated is true: AND becomes
1146 * @c provenance_plus (OR) and vice-versa. NOT is handled by flipping
1147 * @p negated and delegating to @c having_Expr_to_provenance_cmp.
1148 *
1149 * @param be Boolean expression from the HAVING clause.
1150 * @param constants Extension OID cache.
1151 * @param negated Whether the expression appears under a NOT.
1152 * @return A @c FuncExpr combining the sub-expressions.
1153 */
1154static FuncExpr *having_BoolExpr_to_provenance(BoolExpr *be, const constants_t *constants, bool negated) {
1155 if(be->boolop == NOT_EXPR) {
1156 Expr *expr = (Expr *) lfirst(list_head(be->args));
1157 return having_Expr_to_provenance_cmp(expr, constants, !negated);
1158 } else {
1159 FuncExpr *result;
1160 List *l = NULL;
1161 ListCell *lc;
1162 ArrayExpr *array = makeNode(ArrayExpr);
1163
1164 array->array_typeid = constants->OID_TYPE_UUID_ARRAY;
1165 array->element_typeid = constants->OID_TYPE_UUID;
1166 array->location = -1;
1167
1168 result = makeNode(FuncExpr);
1169 result->funcresulttype = constants->OID_TYPE_UUID;
1170 result->funcvariadic = true;
1171 result->location = be->location;
1172 result->args = list_make1(array);
1173
1174 if ((be->boolop == AND_EXPR && !negated) || (be->boolop == OR_EXPR && negated))
1175 result->funcid = constants->OID_FUNCTION_PROVENANCE_TIMES;
1176 else if ((be->boolop == AND_EXPR && negated) || (be->boolop == OR_EXPR && !negated))
1177 result->funcid = constants->OID_FUNCTION_PROVENANCE_PLUS;
1178 else
1179 provsql_error("Unknown Boolean operator");
1180
1181 foreach (lc, be->args) {
1182 Expr *expr = (Expr *)lfirst(lc);
1183 FuncExpr *arg = having_Expr_to_provenance_cmp(expr, constants, negated);
1184 l = lappend(l, arg);
1185 }
1186
1187 array->elements = l;
1188
1189 return result;
1190 }
1191}
1192
1193/**
1194 * @brief Dispatch a HAVING sub-expression to the appropriate converter.
1195 *
1196 * Entry point for the mutual recursion between
1197 * @c having_BoolExpr_to_provenance and @c having_OpExpr_to_provenance_cmp.
1198 *
1199 * @param expr Sub-expression to convert (@c BoolExpr or @c OpExpr).
1200 * @param constants Extension OID cache.
1201 * @param negated Whether the expression appears under a NOT.
1202 * @return Converted @c FuncExpr.
1203 */
1204static FuncExpr *having_Expr_to_provenance_cmp(Expr *expr, const constants_t *constants, bool negated)
1205{
1206 if (IsA(expr, BoolExpr))
1207 return having_BoolExpr_to_provenance((BoolExpr *)expr, constants, negated);
1208 else if (IsA(expr, OpExpr))
1209 return having_OpExpr_to_provenance_cmp((OpExpr *)expr, constants, negated);
1210 else
1211 provsql_error("Unknown structure within Boolean expression");
1212}
1213
1214/* -------------------------------------------------------------------------
1215 * Random-variable WHERE-clause rewriting
1216 *
1217 * Mirror of the HAVING trio above. An OpExpr whose @c opfuncid matches
1218 * one of the @c random_variable_{eq,ne,le,lt,ge,gt} procedures is an
1219 * RV comparison; the planner hook lifts it out of @c jointree->quals,
1220 * builds an equivalent @c provenance_cmp(left_uuid, op_oid, right_uuid)
1221 * @c FuncExpr, and conjoins the resulting UUID into the row's
1222 * provenance via @c provenance_times. The lifted WHERE conjunct is
1223 * removed (or the whole WHERE replaced by @c NULL when only RV cmps
1224 * were present); what remains is purely Boolean and the executor
1225 * evaluates it in the usual way. The RV-cmp operators themselves are
1226 * boolean placeholders -- their procedure raises if reached, which can
1227 * happen only when the planner hook is bypassed (e.g. provsql.active
1228 * off).
1229 * ------------------------------------------------------------------------- */
1230
1231/**
1232 * @brief Test whether @p funcoid is one of the @c random_variable_*
1233 * comparison procedures, and if so return its
1234 * @c ComparisonOperator index.
1235 *
1236 * @param constants Extension OID cache.
1237 * @param funcoid Procedure OID to test (typically @c OpExpr->opfuncid).
1238 * @return Index in @c [0..6) on match, @c -1 otherwise. Match indices
1239 * line up with @c ComparisonOperator (EQ=0, NE=1, LE=2, LT=3,
1240 * GE=4, GT=5).
1241 */
1242static int rv_cmp_index(const constants_t *constants, Oid funcoid)
1243{
1244 for (int i = 0; i < 6; ++i) {
1245 if (funcoid == constants->OID_FUNCTION_RV_CMP[i])
1246 return i;
1247 }
1248 return -1;
1249}
1250
1251/**
1252 * @brief Wrap an expression returning @c random_variable in a
1253 * binary-coercible cast to @c uuid.
1254 *
1255 * Operand of the comparison may be a Var, a constant lifted by an
1256 * implicit cast, or another OpExpr (e.g. <tt>a + b</tt>).
1257 * @c random_variable and @c uuid share the same byte layout, so we
1258 * emit a @c RelabelType node -- the planner sees a zero-cost type
1259 * relabel, the executor never dispatches through a runtime
1260 * conversion function.
1261 */
1262static Expr *
1263wrap_random_variable_uuid(Node *operand, const constants_t *constants)
1264{
1265 RelabelType *rt = makeNode(RelabelType);
1266 rt->arg = (Expr *) operand;
1267 rt->resulttype = constants->OID_TYPE_UUID;
1268 rt->resulttypmod = -1;
1269 rt->resultcollid = InvalidOid;
1270 rt->relabelformat = COERCE_IMPLICIT_CAST;
1271 rt->location = -1;
1272 return (Expr *) rt;
1273}
1274
1275/* Forward declaration: the BoolExpr and Expr walkers below are mutually
1276 * recursive (BoolExpr recurses into Expr for each AND/OR child). */
1277static FuncExpr *rv_Expr_to_provenance(Expr *expr,
1278 const constants_t *constants,
1279 bool negated);
1280
1281/**
1282 * @brief Convert a single RV-comparison @c OpExpr into a
1283 * @c provenance_cmp() FuncExpr returning UUID.
1284 *
1285 * If @p negated is true the operator OID is replaced by its negator
1286 * (so <tt>NOT (a &gt; b)</tt> becomes <tt>a &le; b</tt> at the
1287 * provenance level), exactly as @c having_OpExpr_to_provenance_cmp
1288 * does.
1289 *
1290 * @param opExpr The comparison expression from the WHERE clause.
1291 * Must satisfy @c rv_cmp_index(opExpr->opfuncid) &ge; 0;
1292 * callers are responsible for the type check.
1293 * @param constants Extension OID cache.
1294 * @param negated Whether the expression appears under a NOT.
1295 */
1296static FuncExpr *
1297rv_OpExpr_to_provenance_cmp(OpExpr *opExpr, const constants_t *constants,
1298 bool negated)
1299{
1300 FuncExpr *cmpExpr;
1301 Const *oid_const;
1302 Oid opno = opExpr->opno;
1303 Node *left = (Node *)linitial(opExpr->args);
1304 Node *right = (Node *)lsecond(opExpr->args);
1305
1306 if (negated) {
1307 opno = get_negator(opno);
1308 if (!opno)
1309 provsql_error("Missing negator for random_variable comparison");
1310 }
1311
1312 oid_const = makeConst(constants->OID_TYPE_INT, -1, InvalidOid,
1313 sizeof(int32), Int32GetDatum(opno), false, true);
1314
1315 cmpExpr = makeNode(FuncExpr);
1316 cmpExpr->funcid = constants->OID_FUNCTION_PROVENANCE_CMP;
1317 cmpExpr->funcresulttype = constants->OID_TYPE_UUID;
1318 cmpExpr->args = list_make3(
1319 wrap_random_variable_uuid(left, constants),
1320 oid_const,
1321 wrap_random_variable_uuid(right, constants));
1322 cmpExpr->location = opExpr->location;
1323
1324 return cmpExpr;
1325}
1326
1327/**
1328 * @brief Convert a Boolean combination of RV comparisons into a
1329 * @c provenance_times / @c provenance_plus expression.
1330 *
1331 * Same De Morgan handling as @c having_BoolExpr_to_provenance: under
1332 * negation, AND ↔ OR (which means PROVENANCE_TIMES ↔ PROVENANCE_PLUS).
1333 * NOT flips @c negated and recurses.
1334 */
1335static FuncExpr *
1336rv_BoolExpr_to_provenance(BoolExpr *be, const constants_t *constants,
1337 bool negated)
1338{
1339 FuncExpr *result;
1340 ArrayExpr *array;
1341 List *l = NIL;
1342 ListCell *lc;
1343
1344 if (be->boolop == NOT_EXPR) {
1345 Expr *child = (Expr *)linitial(be->args);
1346 return rv_Expr_to_provenance(child, constants, !negated);
1347 }
1348
1349 array = makeNode(ArrayExpr);
1350 array->array_typeid = constants->OID_TYPE_UUID_ARRAY;
1351 array->element_typeid = constants->OID_TYPE_UUID;
1352 array->location = -1;
1353
1354 result = makeNode(FuncExpr);
1355 result->funcresulttype = constants->OID_TYPE_UUID;
1356 result->funcvariadic = true;
1357 result->location = be->location;
1358 result->args = list_make1(array);
1359
1360 if ((be->boolop == AND_EXPR && !negated) ||
1361 (be->boolop == OR_EXPR && negated))
1362 result->funcid = constants->OID_FUNCTION_PROVENANCE_TIMES;
1363 else if ((be->boolop == AND_EXPR && negated) ||
1364 (be->boolop == OR_EXPR && !negated))
1365 result->funcid = constants->OID_FUNCTION_PROVENANCE_PLUS;
1366 else
1367 provsql_error("Unknown Boolean operator in random_variable WHERE clause");
1368
1369 foreach (lc, be->args) {
1370 FuncExpr *arg = rv_Expr_to_provenance((Expr *)lfirst(lc),
1371 constants, negated);
1372 l = lappend(l, arg);
1373 }
1374 array->elements = l;
1375
1376 return result;
1377}
1378
1379/**
1380 * @brief Dispatch a WHERE sub-expression to the appropriate RV converter.
1381 *
1382 * Entry point for the mutual recursion between
1383 * @c rv_BoolExpr_to_provenance and @c rv_OpExpr_to_provenance_cmp.
1384 */
1385static FuncExpr *
1386rv_Expr_to_provenance(Expr *expr, const constants_t *constants, bool negated)
1387{
1388 if (IsA(expr, BoolExpr))
1389 return rv_BoolExpr_to_provenance((BoolExpr *)expr, constants, negated);
1390 if (IsA(expr, OpExpr)) {
1391 OpExpr *opExpr = (OpExpr *)expr;
1392 if (rv_cmp_index(constants, opExpr->opfuncid) >= 0)
1393 return rv_OpExpr_to_provenance_cmp(opExpr, constants, negated);
1394 }
1395 provsql_error("Unsupported sub-expression in random_variable WHERE clause "
1396 "(only Boolean combinations of RV comparisons are accepted)");
1397 return NULL; /* unreachable, silences -Wreturn-type */
1398}
1399
1400/**
1401 * @brief Test whether an Expr (sub-)tree contains any RV comparison.
1402 *
1403 * Used by the WHERE-clause extractor to decide whether a top-level
1404 * conjunct mentions any random_variable comparator and therefore
1405 * needs lifting (or, if the conjunct mixes RV and non-RV operators
1406 * in a way we cannot rewrite, errors).
1407 */
1408static bool
1409expr_contains_rv_cmp(Node *node, const constants_t *constants)
1410{
1411 if (node == NULL)
1412 return false;
1413 if (IsA(node, OpExpr)) {
1414 OpExpr *opExpr = (OpExpr *)node;
1415 if (rv_cmp_index(constants, opExpr->opfuncid) >= 0)
1416 return true;
1417 }
1418 if (IsA(node, BoolExpr)) {
1419 BoolExpr *be = (BoolExpr *)node;
1420 ListCell *lc;
1421 foreach (lc, be->args) {
1422 if (expr_contains_rv_cmp(lfirst(lc), constants))
1423 return true;
1424 }
1425 return false;
1426 }
1427 return false;
1428}
1429
1430/**
1431 * @brief Test whether @p expr is a Boolean combination of @em only
1432 * random_variable comparisons (no other leaves allowed).
1433 *
1434 * Mirrors @c check_expr_on_aggregate / @c check_boolexpr_on_aggregate
1435 * for the agg_token WHERE-to-HAVING migration path. Recursively
1436 * accepts:
1437 * - @c BoolExpr (AND/OR/NOT) all of whose children pass; and
1438 * - @c OpExpr matching one of the @c random_variable_* comparators.
1439 *
1440 * Anything else (a non-RV @c OpExpr, a @c Var, a @c Const, a non-cmp
1441 * @c FuncExpr) makes the expression mixed and unsupportable by the
1442 * RV-only walker, so the function returns @c false and the caller
1443 * raises a clear error.
1444 */
1445static bool
1446check_expr_on_rv(Expr *expr, const constants_t *constants)
1447{
1448 if (expr == NULL)
1449 return false;
1450 if (IsA(expr, OpExpr))
1451 return rv_cmp_index(constants, ((OpExpr *)expr)->opfuncid) >= 0;
1452 if (IsA(expr, BoolExpr)) {
1453 BoolExpr *be = (BoolExpr *)expr;
1454 ListCell *lc;
1455 foreach (lc, be->args) {
1456 if (!check_expr_on_rv((Expr *)lfirst(lc), constants))
1457 return false;
1458 }
1459 return true;
1460 }
1461 return false;
1462}
1463
1464/* The earlier RV-only WHERE walker (@c extract_rv_cmps_from_quals)
1465 * has been folded into the unified classifier
1466 * @c migrate_probabilistic_quals further down in this file; both the
1467 * agg_token and the random_variable migration paths are now special
1468 * cases of one walk over @c q->jointree->quals. See the comment on
1469 * @c qual_class for the routing matrix. */
1470
1471/**
1472 * @brief Build the combined provenance expression to be added to the SELECT list.
1473 *
1474 * Combines the tokens in @p prov_atts according to @p op:
1475 * - @c SR_PLUS → use the first token directly (union branch; the outer
1476 * @c array_agg / @c provenance_plus is added later if needed).
1477 * - @c SR_TIMES → wrap all tokens in @c provenance_times(...).
1478 * - @c SR_MONUS → wrap all tokens in @c provenance_monus(...).
1479 *
1480 * When @p aggregation or @p group_by_rewrite is true, wraps the result in
1481 * @c array_agg + @c provenance_plus to collapse groups. A @c provenance_delta
1482 * gate is added for plain aggregations without a HAVING clause.
1483 *
1484 * If a HAVING clause is present it is removed from @p q->havingQual and
1485 * converted into a provenance expression via @c having_Expr_to_provenance_cmp.
1486 *
1487 * If @c provsql_where_provenance is enabled, equality gates (@c provenance_eq)
1488 * are prepended for join conditions and WHERE equalities, and a projection gate
1489 * is appended if the output columns form a proper subset of the input columns.
1490 *
1491 * @param constants Extension OID cache.
1492 * @param q Query being rewritten (HAVING is cleared if present).
1493 * @param prov_atts List of provenance @c Var nodes.
1494 * @param aggregation True if the query contains aggregate functions.
1495 * @param group_by_rewrite True if a GROUP BY requires the plus-aggregate wrapper.
1496 * @param op Semiring operation to use for combining tokens.
1497 * @param columns Per-RTE column-numbering array (for where-provenance).
1498 * For provenance-tracked @c RTE_RELATION entries, the
1499 * -1 sentinel is used to identify them; the PROJECT
1500 * gate positions for their columns use @c varattno
1501 * rather than the query-order-dependent sequential
1502 * numbers (see @c build_column_map() for the
1503 * rationale).
1504 * @param nbcols Total number of non-provenance output columns.
1505 * @param wrap_assumed_boolean If true, wrap the result in
1506 * @c provenance_assumed_boolean so downstream
1507 * probability evaluators may treat it as Boolean.
1508 * @return The provenance @c Expr to be appended to the target list.
1509 */
1510static Expr *make_provenance_expression(const constants_t *constants, Query *q,
1511 List *prov_atts, bool aggregation,
1512 bool group_by_rewrite,
1513 semiring_operation op, int **columns,
1514 int nbcols, bool wrap_assumed_boolean) {
1515 Expr *result;
1516 ListCell *lc_v;
1517
1518 if (op == SR_PLUS) {
1519 result = linitial(prov_atts);
1520 } else {
1521 if (my_lnext(prov_atts, list_head(prov_atts)) == NULL) {
1522 result = linitial(prov_atts);
1523 } else {
1524 FuncExpr *expr = makeNode(FuncExpr);
1525 if (op == SR_TIMES) {
1526 ArrayExpr *array = makeNode(ArrayExpr);
1527
1528 expr->funcid = constants->OID_FUNCTION_PROVENANCE_TIMES;
1529 expr->funcvariadic = true;
1530
1531 array->array_typeid = constants->OID_TYPE_UUID_ARRAY;
1532 array->element_typeid = constants->OID_TYPE_UUID;
1533 array->elements = prov_atts;
1534 array->location = -1;
1535
1536 expr->args = list_make1(array);
1537 } else { // SR_MONUS
1538 expr->funcid = constants->OID_FUNCTION_PROVENANCE_MONUS;
1539 expr->args = prov_atts;
1540 }
1541 expr->funcresulttype = constants->OID_TYPE_UUID;
1542 expr->location = -1;
1543
1544 result = (Expr *)expr;
1545 }
1546
1547 if (group_by_rewrite || aggregation) {
1548 Aggref *agg = makeNode(Aggref);
1549 FuncExpr *plus = makeNode(FuncExpr);
1550 TargetEntry *te_inner = makeNode(TargetEntry);
1551
1552 q->hasAggs = true;
1553
1554 te_inner->resno = 1;
1555 te_inner->expr = (Expr *)result;
1556
1557 agg->aggfnoid = constants->OID_FUNCTION_ARRAY_AGG;
1558 agg->aggtype = constants->OID_TYPE_UUID_ARRAY;
1559 agg->args = list_make1(te_inner);
1560 agg->aggkind = AGGKIND_NORMAL;
1561 agg->location = -1;
1562#if PG_VERSION_NUM >= 140000
1563 agg->aggno = agg->aggtransno = -1;
1564#endif
1565
1566 agg->aggargtypes = list_make1_oid(constants->OID_TYPE_UUID);
1567
1568 plus->funcid = constants->OID_FUNCTION_PROVENANCE_PLUS;
1569 plus->args = list_make1(agg);
1570 plus->funcresulttype = constants->OID_TYPE_UUID;
1571 plus->location = -1;
1572
1573 result = (Expr *)plus;
1574 }
1575
1576 /* HAVING quals come in two flavours. A qual that references an
1577 * agg_token Var or a provenance_aggregate() wrapper must be lifted
1578 * into a provenance_cmp gate so the per-group truth value is
1579 * carried by the provenance circuit (and the corresponding gate_agg
1580 * remains evaluable). Anything else -- a deterministic scalar
1581 * predicate, or one over random_variable aggregates collapsed by
1582 * expected() / variance() / moment() to a plain double -- is left
1583 * in q->havingQual for PostgreSQL to evaluate natively, and the
1584 * per-group provenance still gets a delta wrapper. */
1585 {
1586 bool lift_having = q->havingQual != NULL &&
1587 needs_having_lift((Node *) q->havingQual, constants);
1588
1589 if (aggregation && !lift_having) {
1590 FuncExpr *deltaExpr = makeNode(FuncExpr);
1591
1592 // adding the delta gate to the provenance circuit
1593 deltaExpr->funcid = constants->OID_FUNCTION_PROVENANCE_DELTA;
1594 deltaExpr->args = list_make1(result);
1595 deltaExpr->funcresulttype = constants->OID_TYPE_UUID;
1596 deltaExpr->location = -1;
1597
1598 result = (Expr *)deltaExpr;
1599 }
1600
1601 if (lift_having) {
1602 result = (Expr*) having_Expr_to_provenance_cmp((Expr*)q->havingQual, constants, false);
1603 q->havingQual = NULL;
1604 }
1605 }
1606 }
1607
1608 /* Part to handle eq gates used for where-provenance.
1609 * Placed before projection gates because they need
1610 * to be deeper in the provenance tree. */
1611 if (provsql_where_provenance && q->jointree) {
1612 ListCell *lc;
1613 foreach (lc, q->jointree->fromlist) {
1614 if (IsA(lfirst(lc), JoinExpr)) {
1615 JoinExpr *je = (JoinExpr *)lfirst(lc);
1616 /* Study equalities coming from From clause */
1617 result =
1618 add_eq_from_Quals_to_Expr(constants, je->quals, result, columns);
1619 }
1620 }
1621 /* Study equalities coming from WHERE clause */
1622 result = add_eq_from_Quals_to_Expr(constants, q->jointree->quals, result,
1623 columns);
1624 }
1625
1627 ArrayExpr *array = makeNode(ArrayExpr);
1628 FuncExpr *fe = makeNode(FuncExpr);
1629 bool projection = false;
1630 int nb_column = 0;
1631 /* Cumulative offset of each RTE within the TIMES gate's concatenated
1632 * locator vector. WhereCircuit::evaluate(TIMES) appends the locator
1633 * vector of each child input in q->rtable order, so a column at
1634 * varattno k of the i-th provenance-tracked base RTE lands at
1635 * prov_offset[i] + k in the concat. varattno alone (the recent fix
1636 * documented at the top of this file) is correct only when there is a
1637 * single provenance-tracked input; for multi-input joins it omits the
1638 * preceding inputs' nb_user_cols and the project gate then reads from
1639 * the wrong table's locator slice.
1640 *
1641 * 1-indexed by rteid for direct indexing via Var->varno; entry 0 is
1642 * unused. Length q->rtable->length + 1. */
1643 int *prov_offset = (int *)palloc0((q->rtable->length + 1) * sizeof(int));
1644 int cum = 0;
1645 Index r;
1646
1647 fe->funcid = constants->OID_FUNCTION_PROVENANCE_PROJECT;
1648 fe->funcvariadic = true;
1649 fe->funcresulttype = constants->OID_TYPE_UUID;
1650 fe->location = -1;
1651
1652 array->array_typeid = constants->OID_TYPE_INT_ARRAY;
1653 array->element_typeid = constants->OID_TYPE_INT;
1654 array->elements = NIL;
1655 array->location = -1;
1656
1657 for (r = 1; r <= (Index)q->rtable->length; ++r) {
1658 prov_offset[r] = cum;
1659 if (columns[r-1]) {
1660 RangeTblEntry *rte_r = (RangeTblEntry *)list_nth(q->rtable, r-1);
1661 int ncols = list_length(rte_r->eref->colnames);
1662 bool is_prov = false;
1663 int nb_user = 0;
1664 int k;
1665 for (k = 0; k < ncols; ++k) {
1666 if (columns[r-1][k] == -1) is_prov = true;
1667 else if (columns[r-1][k] > 0) nb_user++;
1668 }
1669 if (is_prov) cum += nb_user;
1670 }
1671 }
1672
1673 foreach (lc_v, q->targetList) {
1674 TargetEntry *te_v = (TargetEntry *)lfirst(lc_v);
1675 if (IsA(te_v->expr, Var)) {
1676 Var *vte_v = (Var *)te_v->expr;
1677 RangeTblEntry *rte_v =
1678 (RangeTblEntry *)lfirst(list_nth_cell(q->rtable, vte_v->varno - 1));
1679 int value_v;
1680#if PG_VERSION_NUM >= 180000
1681 if (rte_v->rtekind == RTE_GROUP) {
1682 Expr *ge = lfirst(list_nth_cell(rte_v->groupexprs, vte_v->varattno - 1));
1683 if(IsA(ge, Var)) {
1684 Var *v = (Var *) ge;
1685 value_v = columns[v->varno - 1] ?
1686 columns[v->varno - 1][v->varattno - 1] : 0;
1687 } else {
1688 Const *ce = makeConst(constants->OID_TYPE_INT, -1, InvalidOid,
1689 sizeof(int32), Int32GetDatum(0), false, true);
1690
1691 array->elements = lappend(array->elements, ce);
1692 value_v = 0;
1693 }
1694 } else
1695#endif
1696 if (rte_v->rtekind != RTE_JOIN) { // Normal RTE
1697 if (rte_v->rtekind == RTE_RELATION && columns[vte_v->varno - 1]) {
1698 /* Determine whether this base table is provenance-tracked by
1699 * scanning for the sentinel -1 entry that build_column_map()
1700 * assigns to the provsql column. */
1701 bool is_prov = false;
1702 int ncols_rte = list_length(rte_v->eref->colnames);
1703 for (int k = 0; k < ncols_rte; k++) {
1704 if (columns[vte_v->varno - 1][k] == -1) {
1705 is_prov = true;
1706 break;
1707 }
1708 }
1709 if (is_prov) {
1710 int raw = columns[vte_v->varno - 1][vte_v->varattno - 1];
1711 /* Local position within this table is `varattno` (the
1712 * provsql column is appended last by add_provenance(), so
1713 * user columns occupy 1..nb_user_cols exactly matching the
1714 * IN gate's Locator vector). We then shift by
1715 * prov_offset[varno] to land in the right slice of the
1716 * TIMES gate's concatenated locator vector when the query
1717 * joins multiple provenance-tracked relations. */
1718 value_v = (raw == -1) ? -1
1719 : (int)vte_v->varattno
1720 + prov_offset[vte_v->varno];
1721 } else {
1722 /* Non-provenance base table: no IN gate exists for it, so
1723 * the position would be out of range regardless. Explicitly
1724 * record 0 so evaluate() returns an empty locator set and
1725 * the positions array stays in sync with the output column
1726 * count. */
1727 Const *ce =
1728 makeConst(constants->OID_TYPE_INT, -1, InvalidOid,
1729 sizeof(int32), Int32GetDatum(0), false, true);
1730 array->elements = lappend(array->elements, ce);
1731 projection = true;
1732 continue;
1733 }
1734 } else {
1735 /* RTE_SUBQUERY and others: the sequential number equals the
1736 * column's 1-indexed position in the subquery's output list,
1737 * which matches what the child gate's evaluate() expects. */
1738 value_v = columns[vte_v->varno - 1] ?
1739 columns[vte_v->varno - 1][vte_v->varattno - 1] : 0;
1740 }
1741 } else { // Join RTE
1742 Var *jav_v = (Var *)lfirst(
1743 list_nth_cell(rte_v->joinaliasvars, vte_v->varattno - 1));
1744 if (jav_v && IsA(jav_v, Var) && columns[jav_v->varno - 1]) {
1745 RangeTblEntry *jrte_v = (RangeTblEntry *)lfirst(
1746 list_nth_cell(q->rtable, jav_v->varno - 1));
1747 if (jrte_v->rtekind == RTE_RELATION) {
1748 /* Provenance-tracking check and varattno fix — same rationale
1749 * as the RTE_RELATION branch above. */
1750 bool is_prov = false;
1751 int ncols_jrte = list_length(jrte_v->eref->colnames);
1752 for (int k = 0; k < ncols_jrte; k++) {
1753 if (columns[jav_v->varno - 1][k] == -1) {
1754 is_prov = true;
1755 break;
1756 }
1757 }
1758 if (is_prov) {
1759 int raw = columns[jav_v->varno - 1][jav_v->varattno - 1];
1760 value_v = (raw == -1) ? -1
1761 : (int)jav_v->varattno
1762 + prov_offset[jav_v->varno];
1763 } else {
1764 Const *ce =
1765 makeConst(constants->OID_TYPE_INT, -1, InvalidOid,
1766 sizeof(int32), Int32GetDatum(0), false, true);
1767 array->elements = lappend(array->elements, ce);
1768 projection = true;
1769 continue;
1770 }
1771 } else {
1772 value_v = columns[jav_v->varno - 1][jav_v->varattno - 1];
1773 }
1774 } else {
1775 value_v = 0;
1776 }
1777 }
1778
1779 /* If this is a valid column */
1780 if (value_v > 0) {
1781 Const *ce =
1782 makeConst(constants->OID_TYPE_INT, -1, InvalidOid, sizeof(int32),
1783 Int32GetDatum(value_v), false, true);
1784
1785 array->elements = lappend(array->elements, ce);
1786
1787 if (value_v != ++nb_column)
1788 projection = true;
1789 } else {
1790 if (value_v != -1)
1791 projection = true;
1792 }
1793 } else { // we have a function in target
1794 Const *ce = makeConst(constants->OID_TYPE_INT, -1, InvalidOid,
1795 sizeof(int32), Int32GetDatum(0), false, true);
1796
1797 array->elements = lappend(array->elements, ce);
1798 projection = true;
1799 }
1800 }
1801
1802 if (nb_column != nbcols)
1803 projection = true;
1804
1805 if (projection) {
1806 fe->args = list_make2(result, array);
1807 result = (Expr *)fe;
1808 } else {
1809 pfree(array);
1810 pfree(fe);
1811 }
1812 }
1813
1814 /* Wrap the finished per-row root in a @c gate_assumed_boolean when
1815 * our caller (the safe-query rewrite path in @c process_query) asks
1816 * for it. Wrapping here -- before @c add_to_select and
1817 * @c replace_provenance_function_by_expression -- means every
1818 * per-row root reference in the final target list carries the
1819 * marker uniformly. Subqueries that this same Query body opens
1820 * (per-atom DISTINCT projections inserted by the rewriter) are
1821 * handled by their own deeper @c process_query / @c make_provenance_expression
1822 * calls with @c wrap_assumed_boolean = false, so the marker sits
1823 * only at the outermost root that surfaces as the user-visible
1824 * row provenance. */
1825 if (wrap_assumed_boolean &&
1826 OidIsValid(constants->OID_FUNCTION_ASSUME_BOOLEAN))
1827 result = wrap_in_assume_boolean(constants, result);
1828
1829 return result;
1830}
1831
1832/* -------------------------------------------------------------------------
1833 * Set-operation & DISTINCT rewriting
1834 * ------------------------------------------------------------------------- */
1835
1836#if PG_VERSION_NUM >= 180000
1837typedef struct {
1838 Index group_rtindex;
1839 List *groupexprs;
1840} resolve_group_rte_ctx;
1841
1842static Node *
1843resolve_group_rte_vars_mutator(Node *node, void *raw_ctx) {
1844 resolve_group_rte_ctx *ctx = (resolve_group_rte_ctx *)raw_ctx;
1845 if (node == NULL)
1846 return NULL;
1847 if (IsA(node, Var)) {
1848 Var *v = (Var *)node;
1849 if (v->varno == ctx->group_rtindex) {
1850 Node *resolved = copyObject(list_nth(ctx->groupexprs, v->varattno - 1));
1851 /* Clear varnullingrels: the group-step nulling bits reference the
1852 * group_rtindex RTE which does not exist in the fresh inner query.
1853 * Leaving them set causes the planner to access simple_rel_array at
1854 * group_rtindex (which has no RelOptInfo), triggering
1855 * "unrecognized RTE kind: 9". */
1856 if (IsA(resolved, Var))
1857 ((Var *)resolved)->varnullingrels = NULL;
1858 return resolved;
1859 }
1860 }
1861 return expression_tree_mutator(node, resolve_group_rte_vars_mutator, raw_ctx);
1862}
1863
1864/**
1865 * @brief Strip PG 18's virtual @c RTE_GROUP entry from @p q in place.
1866 *
1867 * @c parseCheckAggregates() appends an @c RTE_GROUP entry at the end of
1868 * @c q->rtable whenever the query has a @c GROUP @c BY clause; references
1869 * to grouped columns in @c targetList and @c jointree->quals point at that
1870 * synthetic RTE rather than the underlying base tables. ProvSQL's
1871 * rewriters need a flat range-table to do their own index arithmetic, so
1872 * we remove the @c RTE_GROUP and resolve every @c Var(@c group_rtindex,
1873 * @c i) back to its base-table expression before going further.
1874 *
1875 * Idempotent: when @c q->hasGroupRTE is already false, returns without
1876 * doing anything.
1877 */
1878void strip_group_rte_pg18(Query *q) {
1879 resolve_group_rte_ctx grp_ctx;
1880 bool found = false;
1881 ListCell *lc;
1882 Index idx = 1;
1883 int rte_len = 0;
1884
1885 if (!q->hasGroupRTE)
1886 return;
1887
1888 foreach (lc, q->rtable) {
1889 RangeTblEntry *r = (RangeTblEntry *) lfirst(lc);
1890 if (r->rtekind == RTE_GROUP) {
1891 grp_ctx.group_rtindex = idx;
1892 grp_ctx.groupexprs = r->groupexprs;
1893 found = true;
1894 rte_len = idx - 1;
1895 break;
1896 }
1897 idx++;
1898 }
1899
1900 if (!found)
1901 return;
1902
1903 q->rtable = list_truncate(q->rtable, rte_len);
1904 q->hasGroupRTE = false;
1905
1906 foreach (lc, q->targetList) {
1907 TargetEntry *te = (TargetEntry *) lfirst(lc);
1908 te->expr = (Expr *) resolve_group_rte_vars_mutator(
1909 (Node *) te->expr, &grp_ctx);
1910 }
1911 if (q->jointree && q->jointree->quals)
1912 q->jointree->quals = resolve_group_rte_vars_mutator(
1913 q->jointree->quals, &grp_ctx);
1914}
1915#endif
1916
1917/* Forward declaration — defined later but needed by rewrite_agg_distinct */
1918static bool provenance_function_walker(Node *node, void *data);
1919
1920/**
1921 * @brief Build the inner GROUP-BY subquery for one @c AGG(DISTINCT key).
1922 *
1923 * Produces:
1924 * @code
1925 * SELECT key_expr, gb_col1, gb_col2, ...
1926 * FROM <same tables as q>
1927 * GROUP BY key_expr, gb_col1, gb_col2, ...
1928 * @endcode
1929 *
1930 * @param q Original query (supplies FROM / WHERE).
1931 * @param key_expr The DISTINCT argument expression.
1932 * @param groupby_tes Non-aggregate target entries that are GROUP BY columns.
1933 * @return Fresh inner @c Query.
1934 */
1935static Query *build_inner_for_distinct_key(Query *q, Expr *key_expr,
1936 List *groupby_tes) {
1937 Query *inner;
1938 List *new_tl = NIL;
1939 List *new_gc = NIL;
1940 ListCell *lc;
1941 int resno = 1, sgref = 1;
1942
1943 inner = copyObject(q);
1944
1945 inner->hasAggs = false;
1946 inner->sortClause = NIL;
1947 inner->limitCount = NULL;
1948 inner->limitOffset = NULL;
1949 inner->distinctClause = NIL;
1950 inner->hasDistinctOn = false;
1951 inner->havingQual = NULL;
1952
1953 /* First column: the DISTINCT key */
1954 {
1955 TargetEntry *kte = makeNode(TargetEntry);
1956 SortGroupClause *sgc = makeNode(SortGroupClause);
1957
1958 kte->expr = copyObject(key_expr);
1959 kte->resno = resno++;
1960 kte->resname = "key";
1961 sgc->tleSortGroupRef = kte->ressortgroupref = sgref++;
1962 get_sort_group_operators(exprType((Node *)kte->expr), true, true, false,
1963 &sgc->sortop, &sgc->eqop, NULL, &sgc->hashable);
1964 new_gc = list_make1(sgc);
1965 new_tl = list_make1(kte);
1966 }
1967
1968 /* Remaining columns: GROUP BY columns from the original query */
1969 foreach (lc, groupby_tes) {
1970 TargetEntry *gyte = copyObject((TargetEntry *)lfirst(lc));
1971 SortGroupClause *sgc = makeNode(SortGroupClause);
1972
1973 gyte->resno = resno++;
1974 gyte->resjunk = false;
1975 sgc->tleSortGroupRef = gyte->ressortgroupref = sgref++;
1976 get_sort_group_operators(exprType((Node *)gyte->expr), true, true, false,
1977 &sgc->sortop, &sgc->eqop, NULL, &sgc->hashable);
1978 new_gc = lappend(new_gc, sgc);
1979 new_tl = lappend(new_tl, gyte);
1980 }
1981
1982 inner->targetList = new_tl;
1983 inner->groupClause = new_gc;
1984 return inner;
1985}
1986
1987/**
1988 * @brief Wrap @p inner in an outer query that applies the original aggregate.
1989 *
1990 * Produces:
1991 * @code
1992 * SELECT AGG(key_col), gb_col1, gb_col2, ...
1993 * FROM inner
1994 * GROUP BY gb_col1, gb_col2, ...
1995 * @endcode
1996 * The DISTINCT flag is cleared; @p inner provides exactly one row per
1997 * (key, group-by) combination, so the plain aggregate gives the right count.
1998 *
1999 * @param orig_agg_te Original @c TargetEntry containing @c AGG(DISTINCT key).
2000 * @param inner Inner query from @c build_inner_for_distinct_key.
2001 * @param n_gb Number of GROUP BY columns (trailing entries in @p inner).
2002 * @param constants Extension OID cache.
2003 * @return Fresh outer @c Query.
2004 */
2005static Query *build_outer_for_distinct_key(TargetEntry *orig_agg_te,
2006 Query *inner, int n_gb,
2007 const constants_t *constants) {
2008 Query *outer = makeNode(Query);
2009 RangeTblEntry *rte = makeNode(RangeTblEntry);
2010 Alias *alias = makeNode(Alias), *eref = makeNode(Alias);
2011 RangeTblRef *rtr = makeNode(RangeTblRef);
2012 FromExpr *jt = makeNode(FromExpr);
2013 List *new_tl = NIL, *new_gc = NIL;
2014 ListCell *lc;
2015 int resno = 1, sgref = 1;
2016 int inner_len = list_length(inner->targetList);
2017 int attno;
2018
2019 /* Wrap inner in a subquery RTE */
2020 alias->aliasname = eref->aliasname = "d";
2021 eref->colnames = NIL;
2022 foreach (lc, inner->targetList) {
2023 TargetEntry *te = lfirst(lc);
2024 eref->colnames = lappend(eref->colnames,
2025 makeString(te->resname ? pstrdup(te->resname) : ""));
2026 }
2027 rte->alias = alias;
2028 rte->eref = eref;
2029 rte->rtekind = RTE_SUBQUERY;
2030 rte->subquery = inner;
2031 rte->inFromCl = true;
2032#if PG_VERSION_NUM < 160000
2033 rte->requiredPerms = ACL_SELECT;
2034#endif
2035
2036 rtr->rtindex = 1;
2037 jt->fromlist = list_make1(rtr);
2038
2039 outer->commandType = CMD_SELECT;
2040 outer->canSetTag = true;
2041 outer->rtable = list_make1(rte);
2042 outer->jointree = jt;
2043 outer->hasAggs = true;
2044
2045 /* First output column: the aggregate over the key (col 1 of inner) */
2046 {
2047 TargetEntry *agg_te = copyObject(orig_agg_te);
2048 Aggref *ar = (Aggref *)agg_te->expr;
2049 Var *key_var = makeNode(Var);
2050 TargetEntry *arg_te = makeNode(TargetEntry);
2051
2052 key_var->varno = 1;
2053 key_var->varattno = 1; /* key is first column of inner */
2054 key_var->vartype = linitial_oid(ar->aggargtypes);
2055 key_var->varcollid = exprCollation((Node *)((TargetEntry *)linitial(ar->args))->expr);
2056 key_var->vartypmod = -1;
2057 key_var->location = -1;
2058 arg_te->resno = 1;
2059 arg_te->expr = (Expr *)key_var;
2060
2061 ar->args = list_make1(arg_te);
2062 ar->aggdistinct = NIL;
2063 agg_te->resno = resno++;
2064 new_tl = list_make1(agg_te);
2065 }
2066
2067 /* Remaining output columns: GROUP BY cols (trailing cols of inner) */
2068 for (attno = inner_len - n_gb + 1; attno <= inner_len; attno++) {
2069 TargetEntry *inner_te = list_nth(inner->targetList, attno - 1);
2070 Var *gb_var = makeNode(Var);
2071 TargetEntry *gb_te = makeNode(TargetEntry);
2072 SortGroupClause *sgc = makeNode(SortGroupClause);
2073
2074 gb_var->varno = 1;
2075 gb_var->varattno = attno;
2076 gb_var->vartype = exprType((Node *)inner_te->expr);
2077 gb_var->varcollid = exprCollation((Node *)inner_te->expr);
2078 gb_var->vartypmod = -1;
2079 gb_var->location = -1;
2080
2081 gb_te->resno = resno++;
2082 gb_te->expr = (Expr *)gb_var;
2083 gb_te->resname = inner_te->resname;
2084
2085 sgc->tleSortGroupRef = gb_te->ressortgroupref = sgref++;
2086 sgc->nulls_first = false;
2087 get_sort_group_operators(gb_var->vartype, true, true, false,
2088 &sgc->sortop, &sgc->eqop, NULL, &sgc->hashable);
2089 new_gc = lappend(new_gc, sgc);
2090 new_tl = lappend(new_tl, gb_te);
2091 }
2092
2093 outer->targetList = new_tl;
2094 outer->groupClause = new_gc;
2095 return outer;
2096}
2097
2098/**
2099 * @brief Rewrite every @c AGG(DISTINCT key) in @p q using independent subqueries.
2100 *
2101 * For a single DISTINCT aggregate, produces a subquery:
2102 * @code
2103 * SELECT AGG(key), gb... FROM (SELECT key, gb... FROM t GROUP BY key, gb...) GROUP BY gb...
2104 * @endcode
2105 * For multiple DISTINCT aggregates with different keys, produces an JOIN
2106 * of one such subquery per aggregate, joined on the GROUP BY columns.
2107 * Non-DISTINCT aggregates are left untouched.
2108 *
2109 * @param q Query to inspect and possibly rewrite.
2110 * @param constants Extension OID cache.
2111 * @return Rewritten query, or @c NULL if no @c AGG(DISTINCT) was found.
2112 */
2113static Query *rewrite_agg_distinct(Query *q, const constants_t *constants) {
2114 List *distinct_agg_tes = NIL;
2115 List *groupby_tes = NIL;
2116 ListCell *lc;
2117
2118#if PG_VERSION_NUM >= 180000
2119 /* In PostgreSQL 18, parseCheckAggregates() injects a virtual RTE_GROUP
2120 * entry at the END of the range table. GROUP BY column Vars in the
2121 * SELECT list point to this entry (varno == group_rtindex) instead of
2122 * the underlying base-table RTE.
2123 *
2124 * Strip that entry now, before we do any index arithmetic (fll, rtr->rtindex,
2125 * agg_idx) or copy q->targetList into groupby_tes. Once removed:
2126 * - q->rtable contains only real RTEs, so appending outer-subquery RTEs
2127 * lands at the correct indices.
2128 * - groupby_tes will carry resolved (base-table) Var expressions, so
2129 * the WHERE equalities and the inner-query target list are correct.
2130 * We also resolve the Var(group_rtindex) refs in q's own targetList and
2131 * WHERE clause so the final query doesn't reference the stripped entry. */
2133#endif
2134
2135 /* Extract AGG(DISTINCT) and GROUP BY targets from the target list.
2136 * Regular AGG() aggregations and expressions containing provenance()
2137 * are left untouched. */
2138 foreach (lc, q->targetList) {
2139 TargetEntry *te = lfirst(lc);
2140 if (IsA(te->expr, Aggref)) {
2141 Aggref *ar = (Aggref *)te->expr;
2142 if (list_length(ar->aggdistinct) > 0)
2143 distinct_agg_tes = lappend(distinct_agg_tes, te);
2144 } else if (provenance_function_walker((Node *)te->expr,
2145 (void *)constants)) {
2146 /* Expression contains provenance() — skip it, it will be
2147 * handled later by the provenance rewriter */
2148 } else {
2149 /* Non-aggregate column — treat as GROUP BY key */
2150 TargetEntry *te_copy = copyObject(te);
2151 te_copy->resjunk = false;
2152 groupby_tes = lappend(groupby_tes, te_copy);
2153 }
2154 }
2155
2156 if (distinct_agg_tes == NIL)
2157 return NULL;
2158
2159 {
2160 int n_aggs = list_length(distinct_agg_tes);
2161 int n_gb = list_length(groupby_tes);
2162 List *outer_queries = NIL;
2163
2164 /* -----------------------------------------------------------------------
2165 * For each DISTINCT aggregate, build:
2166 * inner_i: SELECT key_i, gb... FROM original... GROUP BY key_i, gb...
2167 * outer_i: SELECT AGG(key_i) ASS agg_i, gb... FROM inner_i GROUP BY gb...
2168 *
2169 * Then produce a final query:
2170 * SELECT gb..., agg_0, ..., agg_{N-1}
2171 * FROM original... JOIN outer_0 ON gb... = gb... [JOIN ...]
2172 * keeping the same order for the output columns.
2173 *
2174 * Column order in the final target list follows q->targetList:
2175 * - DISTINCT agg i → Var(n+i, 1) (agg col of outer_i)
2176 * ----------------------------------------------------------------------- */
2177
2178 /* Build one inner + one outer query per DISTINCT aggregate */
2179 foreach (lc, distinct_agg_tes) {
2180 TargetEntry *agg_te = lfirst(lc);
2181 Aggref *ar = (Aggref *)agg_te->expr;
2182 if(list_length(ar->args) != 1)
2183 provsql_error("AGG(DISTINCT) with more than one argument is not supported");
2184 else {
2185 Expr *key_expr = (Expr *)((TargetEntry *)linitial(ar->args))->expr;
2186 Query *inner = build_inner_for_distinct_key(q, key_expr, groupby_tes);
2187 Query *outer = build_outer_for_distinct_key(agg_te, inner, n_gb, constants);
2188 outer_queries = lappend(outer_queries, outer);
2189 }
2190 }
2191
2192 {
2193 /* One subquery RTE per outer query */
2194 int i = 0;
2195 foreach (lc, outer_queries) {
2196 Query *oq = lfirst(lc);
2197 RangeTblEntry *rte = makeNode(RangeTblEntry);
2198 Alias *alias = makeNode(Alias), *eref = makeNode(Alias);
2199 ListCell *lc2;
2200 char buf[16];
2201
2202 snprintf(buf, sizeof(buf), "d%d", i + 1);
2203 alias->aliasname = eref->aliasname = pstrdup(buf);
2204 eref->colnames = NIL;
2205 foreach (lc2, oq->targetList) {
2206 TargetEntry *te = lfirst(lc2);
2207 eref->colnames = lappend(eref->colnames,
2208 makeString(te->resname ? pstrdup(te->resname) : ""));
2209 }
2210 rte->alias = alias;
2211 rte->eref = eref;
2212 rte->rtekind = RTE_SUBQUERY;
2213 rte->subquery = oq;
2214 rte->inFromCl = true;
2215#if PG_VERSION_NUM < 160000
2216 rte->requiredPerms = ACL_SELECT;
2217#endif
2218 q->rtable = lappend(q->rtable, rte);
2219 i++;
2220 }
2221
2222 /* Build FROM list and WHERE conditions for the implicit join.
2223 * Use a simple FROM original..., outer_i, ... WHERE original.gb_j = outer_i.gb_j */
2224 {
2225 FromExpr *jt = q->jointree;
2226 List *from_list = jt->fromlist;
2227 unsigned fll = list_length(from_list);
2228 List *where_args = NIL;
2229
2230 for (i = fll+1; i <= fll+n_aggs; i++) {
2231 RangeTblRef *rtr = makeNode(RangeTblRef);
2232 ListCell *lc2;
2233 unsigned j=0;
2234
2235 rtr->rtindex = i;
2236 from_list = lappend(from_list, rtr);
2237
2238 /* outer_0.gb_j = outer_i.gb_j for each GROUP BY column j */
2239 foreach(lc2, groupby_tes) {
2240 TargetEntry *gb_te = lfirst(lc2);
2241 int gb_attno = ++j + 1; /* col 1 = agg, cols 2+ = GB */
2242 Oid ytype = exprType((Node *)gb_te->expr);
2243 Oid opno = find_equality_operator(ytype, ytype);
2244 Operator opInfo = SearchSysCache1(OPEROID, ObjectIdGetDatum(opno));
2245 Form_pg_operator opform;
2246 OpExpr *oe = makeNode(OpExpr);
2247 Expr *le = copyObject(gb_te->expr);
2248 Var *rv = makeNode(Var);
2249 Oid collation=exprCollation((Node*) le);
2250
2251 if (!HeapTupleIsValid(opInfo))
2252 provsql_error("could not find equality operator for type %u",
2253 ytype);
2254 opform = (Form_pg_operator)GETSTRUCT(opInfo);
2255
2256 oe->opno = opno;
2257 oe->opfuncid = opform->oprcode;
2258 oe->opresulttype = opform->oprresult;
2259 oe->opcollid = InvalidOid;
2260 oe->inputcollid = collation;
2261 oe->location = -1;
2262 ReleaseSysCache(opInfo);
2263
2264 rv->varno = i; rv->varattno = gb_attno;
2265 rv->vartype = ytype; rv->varcollid = collation;
2266 rv->vartypmod = -1; rv->location = -1;
2267
2268 oe->args = list_make2(le, rv);
2269 where_args = lappend(where_args, oe);
2270 }
2271 }
2272
2273 if (list_length(where_args) == 0) {
2274 jt->quals = NULL;
2275 } else if (list_length(where_args) == 1) {
2276 jt->quals = linitial(where_args);
2277 } else {
2278 BoolExpr *be = makeNode(BoolExpr);
2279 be->boolop = AND_EXPR;
2280 be->args = where_args;
2281 be->location = -1;
2282 jt->quals = (Node *)be;
2283 }
2284 }
2285
2286 /* Build final target list in original column order.
2287 * DISTINCT agg i → Var(i+1, 1); GROUP BY col j → Var(1, 2+j). */
2288 {
2289 int agg_idx = list_length(q->jointree->fromlist) - n_aggs + 1;
2290 ListCell *lc2;
2291
2292 foreach (lc2, q->targetList) {
2293 TargetEntry *te = lfirst(lc2);
2294
2295 if (IsA(te->expr, Aggref) &&
2296 ((Aggref *)te->expr)->aggdistinct != NIL) {
2297 Var *v = makeNode(Var);
2298 v->varno = agg_idx++; /* outer_{agg_idx} RTE */
2299 v->varattno = 1; /* agg result is col 1 of each outer */
2300 v->vartypmod = -1;
2301 v->location = -1;
2302 te->expr = (Expr*)v;
2303 }
2304 }
2305 }
2306
2307 return q;
2308 }
2309 }
2310}
2311
2312
2313/* -------------------------------------------------------------------------
2314 * Aggregation replacement mutator
2315 * ------------------------------------------------------------------------- */
2316
2317/** @brief Context for the @c aggregation_mutator tree walker. */
2319 List *prov_atts; ///< List of provenance Var nodes
2320 semiring_operation op; ///< Semiring operation for combining tokens
2321 const constants_t *constants; ///< Extension OID cache
2323
2324/**
2325 * @brief Tree-mutator that replaces Aggrefs with provenance-aware aggregates.
2326 * @param node Current expression tree node.
2327 * @param ctx Pointer to an @c aggregation_mutator_context (prov_atts,
2328 * op, and constants).
2329 * @return Possibly modified node.
2330 */
2331static Node *aggregation_mutator(Node *node, void *ctx) {
2333 if (node == NULL)
2334 return NULL;
2335
2336 if (IsA(node, Aggref)) {
2337 Aggref *ar_v = (Aggref *)node;
2338 return (Node *)make_aggregation_expression(context->constants, ar_v,
2339 context->prov_atts, context->op);
2340 }
2341
2342 return expression_tree_mutator(node, aggregation_mutator, ctx);
2343}
2344
2345/**
2346 * @brief Wrap a @c provenance_aggregate FuncExpr with a cast to the
2347 * original aggregate return type.
2348 *
2349 * @param prov_agg The provenance_aggregate FuncExpr to wrap.
2350 * @param constants Extension OID cache.
2351 * @return Cast FuncExpr wrapping @p prov_agg.
2352 */
2353static Node *wrap_agg_token_with_cast(FuncExpr *prov_agg,
2354 const constants_t *constants) {
2355 Const *typ_const = (Const *)lsecond(prov_agg->args);
2356 Oid target_type = DatumGetObjectId(typ_const->constvalue);
2357 CoercionPathType pathtype;
2358 Oid castfuncid;
2359
2360 pathtype = find_coercion_pathway(target_type,
2361 constants->OID_TYPE_AGG_TOKEN,
2362 COERCION_EXPLICIT, &castfuncid);
2363 if (pathtype == COERCION_PATH_FUNC && OidIsValid(castfuncid)) {
2364 FuncExpr *cast = makeNode(FuncExpr);
2365 cast->funcid = castfuncid;
2366 cast->funcresulttype = target_type;
2367 cast->funcretset = false;
2368 cast->funcvariadic = false;
2369 cast->funcformat = COERCE_IMPLICIT_CAST;
2370 cast->args = list_make1(prov_agg);
2371 cast->location = -1;
2372 return (Node *)cast;
2373 }
2374
2375 provsql_error("no cast from agg_token to %s for arithmetic on aggregate",
2376 format_type_be(target_type));
2377 return (Node *)prov_agg; /* unreachable */
2378}
2379
2380/**
2381 * @brief Cast @c provenance_aggregate arguments of an operator or
2382 * function when the formal parameter type requires it.
2383 *
2384 * For each argument in @p args that is a @c provenance_aggregate call,
2385 * check the corresponding formal parameter type of the parent function
2386 * @p parent_funcid. If the formal type is polymorphic or @c agg_token
2387 * itself, the argument is left alone. Otherwise a cast to the original
2388 * aggregate return type is inserted.
2389 *
2390 * @param args Argument list to inspect (modified in place).
2391 * @param parent_funcid OID of the parent function / operator implementor.
2392 * @param constants Extension OID cache.
2393 */
2394static void maybe_cast_agg_token_args(List *args, Oid parent_funcid,
2395 const constants_t *constants) {
2396 HeapTuple tp;
2397 Form_pg_proc procForm;
2398 ListCell *lc;
2399 int i;
2400
2401 tp = SearchSysCache1(PROCOID, ObjectIdGetDatum(parent_funcid));
2402 if (!HeapTupleIsValid(tp))
2403 return;
2404 procForm = (Form_pg_proc) GETSTRUCT(tp);
2405
2406 i = 0;
2407 foreach(lc, args) {
2408 Node *arg = lfirst(lc);
2409
2410 if (i < procForm->pronargs && IsA(arg, FuncExpr) &&
2411 ((FuncExpr *)arg)->funcid == constants->OID_FUNCTION_PROVENANCE_AGGREGATE) {
2412 Oid formal_type = procForm->proargtypes.values[i];
2413
2414 if (formal_type != constants->OID_TYPE_AGG_TOKEN &&
2415 !IsPolymorphicType(formal_type)) {
2416 lfirst(lc) = wrap_agg_token_with_cast((FuncExpr *)arg, constants);
2417 }
2418 }
2419 i++;
2420 }
2421
2422 ReleaseSysCache(tp);
2423}
2424
2425/**
2426 * @brief Tree-mutator that casts @c provenance_aggregate results back
2427 * to the original aggregate return type where needed.
2428 *
2429 * After the aggregation mutator replaces Aggrefs with
2430 * @c provenance_aggregate calls (returning @c agg_token), this
2431 * post-processing step inserts casts where the surrounding expression
2432 * expects a different type (e.g. @c SUM(id)+1). Arguments to
2433 * functions that accept @c agg_token or polymorphic types are left
2434 * alone.
2435 *
2436 * @param node Current expression tree node.
2437 * @param ctx Pointer to the @c constants_t OID cache.
2438 * @return Possibly modified node.
2439 */
2440static Node *cast_agg_token_mutator(Node *node, void *ctx) {
2441 const constants_t *constants = (const constants_t *)ctx;
2442 Node *result;
2443
2444 if (node == NULL)
2445 return NULL;
2446
2447 /* Recurse first, then fix up arguments at this level. */
2448 result = expression_tree_mutator(node, cast_agg_token_mutator, ctx);
2449
2450 if (IsA(result, OpExpr)) {
2451 OpExpr *op = (OpExpr *)result;
2452 set_opfuncid(op);
2453 maybe_cast_agg_token_args(op->args, op->opfuncid, constants);
2454 } else if (IsA(result, FuncExpr)) {
2455 FuncExpr *fe = (FuncExpr *)result;
2456 if (fe->funcid != constants->OID_FUNCTION_PROVENANCE_AGGREGATE)
2457 maybe_cast_agg_token_args(fe->args, fe->funcid, constants);
2458 }
2459
2460 return result;
2461}
2462
2463/**
2464 * @brief Replace every @c Aggref in @p q with a provenance-aware aggregate.
2465 *
2466 * Walks the query tree and substitutes each @c Aggref node with the result
2467 * of @c make_aggregation_expression, which wraps the original aggregate in
2468 * the semimodule machinery (@c provenance_semimod + @c array_agg +
2469 * @c provenance_aggregate).
2470 *
2471 * @param constants Extension OID cache.
2472 * @param q Query to mutate in place.
2473 * @param prov_atts List of provenance @c Var nodes.
2474 * @param op Semiring operation for combining tokens across rows.
2475 */
2476static void
2478 Query *q, List *prov_atts,
2479 semiring_operation op) {
2480
2481 aggregation_mutator_context context = {prov_atts, op, constants};
2482 ListCell *lc;
2483
2484 query_tree_mutator(q, aggregation_mutator, &context,
2485 QTW_DONT_COPY_QUERY | QTW_IGNORE_RT_SUBQUERIES);
2486
2487 /* Post-processing: for target-list entries where a provenance_aggregate
2488 * result is nested inside an outer expression (e.g. SUM(id)+1),
2489 * insert a cast from agg_token back to the original aggregate return
2490 * type. Standalone provenance_aggregate entries are left as agg_token
2491 * so they display as "value (*)". */
2492 foreach(lc, q->targetList) {
2493 TargetEntry *te = (TargetEntry *)lfirst(lc);
2494 if (te->expr == NULL)
2495 continue;
2496 /* Skip standalone provenance_aggregate calls */
2497 if (IsA(te->expr, FuncExpr) &&
2498 ((FuncExpr *)te->expr)->funcid == constants->OID_FUNCTION_PROVENANCE_AGGREGATE)
2499 continue;
2500 te->expr = (Expr *)cast_agg_token_mutator((Node *)te->expr,
2501 (void *)constants);
2502 }
2503}
2504
2505/**
2506 * @brief Append the provenance expression to @p q's target list.
2507 *
2508 * Inserts a new @c TargetEntry named @c provsql immediately before any
2509 * @c resjunk entries (which must remain last) and adjusts the @c resno
2510 * of subsequent entries accordingly.
2511 *
2512 * @param q Query to modify in place.
2513 * @param provenance Expression to add (becomes the @c provsql output column).
2514 */
2515static void add_to_select(Query *q, Expr *provenance) {
2516 TargetEntry *newte = makeNode(TargetEntry);
2517 bool inserted = false;
2518 unsigned resno = 0;
2519
2520 newte->expr = provenance;
2521 newte->resname = (char *)PROVSQL_COLUMN_NAME;
2522
2523 if (IsA(provenance, Var)) {
2524 RangeTblEntry *rte = list_nth(q->rtable, ((Var *)provenance)->varno - 1);
2525 newte->resorigtbl = rte->relid;
2526 newte->resorigcol = ((Var *)provenance)->varattno;
2527 }
2528
2529 /* Make sure to insert before all resjunk Target Entry */
2530 for (ListCell *cell = list_head(q->targetList); cell != NULL;) {
2531 TargetEntry *te = (TargetEntry *)lfirst(cell);
2532
2533 if (!inserted)
2534 ++resno;
2535
2536 if (te->resjunk) {
2537 if (!inserted) {
2538 newte->resno = resno;
2539 q->targetList = list_insert_nth(q->targetList, resno - 1, newte);
2540 cell = list_nth_cell(q->targetList, resno);
2541 te = (TargetEntry *)lfirst(cell);
2542 inserted = true;
2543 }
2544
2545 ++te->resno;
2546 }
2547
2548 cell = my_lnext(q->targetList, cell);
2549 }
2550
2551 if (!inserted) {
2552 newte->resno = resno + 1;
2553 q->targetList = lappend(q->targetList, newte);
2554 }
2555}
2556
2557/* -------------------------------------------------------------------------
2558 * Provenance function replacement
2559 * ------------------------------------------------------------------------- */
2560
2561/** @brief Context for the @c provenance_mutator tree walker. */
2563 Expr *provsql; ///< Provenance expression to substitute for provenance() calls
2564 const constants_t *constants; ///< Extension OID cache
2565 bool provsql_has_aggref; ///< @c true when @c provsql contains an @c Aggref (set once by @c replace_provenance_function_by_expression). When @c true, a @c provenance() substitution that lands inside another @c Aggref's argument tree would produce a nested same-level aggregate -- @c parse_agg.c forbids that shape, the planner's @c preprocess_aggrefs_walker does not recurse through @c Aggref boundaries, and the inner @c Aggref's @c aggno stays at the @c -1 sentinel and crashes @c ExecInterpExpr on @c ecxt_aggvalues[-1].
2566 bool inside_aggref; ///< @c true while descending the argument tree of an @c Aggref node.
2568
2569/**
2570 * @brief @c expression_tree_walker predicate: returns @c true on the first
2571 * @c Aggref it encounters.
2572 *
2573 * Used to decide whether the provenance expression about to be substituted
2574 * would inject a nested aggregate when a @c provenance() call lives inside
2575 * another @c Aggref's argument tree.
2576 */
2577static bool
2578expr_contains_aggref_walker(Node *node, void *context) {
2579 if (node == NULL)
2580 return false;
2581 if (IsA(node, Aggref))
2582 return true;
2583 return expression_tree_walker(node, expr_contains_aggref_walker, context);
2584}
2585
2586/**
2587 * @brief Tree-mutator that replaces provenance() calls with the actual provenance expression.
2588 * @param node Current expression tree node.
2589 * @param ctx Pointer to a @c provenance_mutator_context (provenance
2590 * expression and constants).
2591 * @return Possibly modified node.
2592 */
2593static Node *provenance_mutator(Node *node, void *ctx) {
2595 if (node == NULL)
2596 return NULL;
2597
2598 if (IsA(node, Aggref)) {
2599 /* Descend into the Aggref's arguments with @c inside_aggref set so we
2600 * can refuse substitutions that would create a nested same-level
2601 * aggregate. Save and restore the flag so sibling sub-expressions
2602 * outside this Aggref see the original value. */
2603 bool saved = context->inside_aggref;
2604 Node *result;
2605 context->inside_aggref = true;
2606 result = expression_tree_mutator(node, provenance_mutator, ctx);
2607 context->inside_aggref = saved;
2608 return result;
2609 }
2610
2611 if (IsA(node, FuncExpr)) {
2612 FuncExpr *f = (FuncExpr *)node;
2613
2614 if (f->funcid == context->constants->OID_FUNCTION_PROVENANCE) {
2615 if (context->inside_aggref && context->provsql_has_aggref) {
2617 "applying an SQL aggregate on top of a ProvSQL-introduced "
2618 "aggregation is not supported: the inner provenance() would "
2619 "be substituted with an expression containing an aggregate, "
2620 "producing a nested same-level aggregate that PostgreSQL "
2621 "rejects. Evaluate the per-row provenance in a subquery "
2622 "and aggregate the resulting scalar outside, or drop the "
2623 "surrounding aggregate.");
2624 }
2625 return (Node *)copyObject(context->provsql);
2626 }
2627 } else if (IsA(node, RangeTblEntry) || IsA(node, RangeTblFunction)) {
2628 // A provenance() expression in a From (not within a subquery) is
2629 // non-sensical
2630 return node;
2631 }
2632
2633 return expression_tree_mutator(node, provenance_mutator, ctx);
2634}
2635
2636/**
2637 * @brief Replace every explicit @c provenance() call in @p q with @p provsql.
2638 *
2639 * Users can write @c provenance() in the target list or WHERE to refer to the
2640 * provenance token of the current tuple. This mutator substitutes those calls
2641 * with the actual computed provenance expression.
2642 *
2643 * @param constants Extension OID cache.
2644 * @param q Query to mutate in place.
2645 * @param provsql Provenance expression to substitute.
2646 */
2647static void
2649 Query *q, Expr *provsql) {
2651
2652 context.provsql = provsql;
2653 context.constants = constants;
2654 context.provsql_has_aggref =
2655 expr_contains_aggref_walker((Node *) provsql, NULL);
2656 context.inside_aggref = false;
2657
2658 query_tree_mutator(q, provenance_mutator, &context,
2659 QTW_DONT_COPY_QUERY | QTW_IGNORE_RT_SUBQUERIES);
2660}
2661
2662/**
2663 * @brief Convert a SELECT DISTINCT into an equivalent GROUP BY.
2664 *
2665 * ProvSQL cannot handle DISTINCT directly (it would collapse provenance
2666 * tokens that should remain separate). This function moves every entry
2667 * from @p q->distinctClause into @p q->groupClause (skipping any that are
2668 * already there) and clears @p q->distinctClause.
2669 *
2670 * @param q Query to modify in place.
2671 */
2673 // First check which are already in the group by clause
2674 // Should be either none or all as "SELECT DISTINCT a, b ... GROUP BY a"
2675 // is invalid
2676 Bitmapset *already_in_group_by = NULL;
2677 ListCell *lc;
2678 foreach (lc, q->groupClause) {
2679 SortGroupClause *sgc = (SortGroupClause *)lfirst(lc);
2680 already_in_group_by =
2681 bms_add_member(already_in_group_by, sgc->tleSortGroupRef);
2682 }
2683
2684 foreach (lc, q->distinctClause) {
2685 SortGroupClause *sgc = (SortGroupClause *)lfirst(lc);
2686 if (!bms_is_member(sgc->tleSortGroupRef, already_in_group_by)) {
2687 q->groupClause = lappend(q->groupClause, sgc);
2688 }
2689 }
2690
2691 q->distinctClause = NULL;
2692}
2693
2694/**
2695 * @brief Remove sort/group references that belonged to removed provenance columns.
2696 *
2697 * After @c remove_provenance_attributes_select strips provenance entries from
2698 * the target list, any GROUP BY, ORDER BY, or DISTINCT clause that referenced
2699 * them by @c tleSortGroupRef must be cleaned up.
2700 *
2701 * @param q Query to modify in place.
2702 * @param removed_sortgrouprefs Bitmapset of @c ressortgroupref values to remove.
2703 */
2704static void
2706 const Bitmapset *removed_sortgrouprefs) {
2707 List **lists[3] = {&q->groupClause, &q->distinctClause, &q->sortClause};
2708 int i = 0;
2709
2710 for (i = 0; i < 3; ++i) {
2711 ListCell *cell, *prev;
2712
2713 for (cell = list_head(*lists[i]), prev = NULL; cell != NULL;) {
2714 SortGroupClause *sgc = (SortGroupClause *)lfirst(cell);
2715 if (bms_is_member(sgc->tleSortGroupRef, removed_sortgrouprefs)) {
2716 *lists[i] = my_list_delete_cell(*lists[i], cell, prev);
2717
2718 if (prev) {
2719 cell = my_lnext(*lists[i], prev);
2720 } else {
2721 cell = list_head(*lists[i]);
2722 }
2723 } else {
2724 prev = cell;
2725 cell = my_lnext(*lists[i], cell);
2726 }
2727 }
2728 }
2729}
2730
2731/**
2732 * @brief Strip the provenance column's type info from a set-operation node.
2733 *
2734 * When a provenance column is removed from a UNION/EXCEPT query's target list,
2735 * the matching entries in the @c SetOperationStmt's @c colTypes, @c colTypmods,
2736 * and @c colCollations lists must also be removed.
2737 *
2738 * @param q Query containing @c setOperations.
2739 * @param removed Boolean array (from @c remove_provenance_attributes_select)
2740 * indicating which columns were removed.
2741 */
2742static void remove_provenance_attribute_setoperations(Query *q, bool *removed) {
2743 SetOperationStmt *so = (SetOperationStmt *)q->setOperations;
2744 List **lists[3] = {&so->colTypes, &so->colTypmods, &so->colCollations};
2745 int i = 0;
2746
2747 for (i = 0; i < 3; ++i) {
2748 ListCell *cell, *prev;
2749 int j;
2750
2751 for (cell = list_head(*lists[i]), prev = NULL, j = 0; cell != NULL; ++j) {
2752 if (removed[j]) {
2753 *lists[i] = my_list_delete_cell(*lists[i], cell, prev);
2754
2755 if (prev) {
2756 cell = my_lnext(*lists[i], prev);
2757 } else {
2758 cell = list_head(*lists[i]);
2759 }
2760 } else {
2761 prev = cell;
2762 cell = my_lnext(*lists[i], cell);
2763 }
2764 }
2765 }
2766}
2767
2768/**
2769 * @brief Wrap a non-ALL set operation in an outer GROUP BY query.
2770 *
2771 * UNION / EXCEPT (without ALL) would deduplicate tuples before ProvSQL can
2772 * attach provenance tokens. To avoid this, the set operation is converted to
2773 * UNION ALL / EXCEPT ALL and a new outer query is built that groups the results
2774 * by all non-provenance columns, collecting tokens into an array for the
2775 * @c provenance_plus evaluation.
2776 *
2777 * After this rewrite the recursive call to @c process_query handles the
2778 * now-ALL inner set operation normally.
2779 *
2780 * @param q Query whose @c setOperations is non-ALL (modified to ALL in place).
2781 * @return New outer query that wraps @p q as a subquery RTE.
2782 */
2784 Query *new_query = makeNode(Query);
2785 RangeTblEntry *rte = makeNode(RangeTblEntry);
2786 FromExpr *jointree = makeNode(FromExpr);
2787 RangeTblRef *rtr = makeNode(RangeTblRef);
2788
2789 SetOperationStmt *stmt = (SetOperationStmt *)q->setOperations;
2790
2791 ListCell *lc;
2792 int sortgroupref = 0;
2793
2794 stmt->all = true;
2795 // we might leave sub nodes of the SetOperationsStmt tree with all = false
2796 // but only for recursive trees of operators and only union can be recursive
2797 // https://doxygen.postgresql.org/prepunion_8c_source.html#l00479
2798 // we will set therefore set them later in process_set_operation_union
2799
2800 rte->rtekind = RTE_SUBQUERY;
2801 rte->subquery = q;
2802 rte->eref = copyObject(((RangeTblEntry *)linitial(q->rtable))->eref);
2803 rte->inFromCl = true;
2804#if PG_VERSION_NUM < 160000
2805 // For PG_VERSION_NUM >= 160000, rte->perminfoindex==0 so no need to
2806 // care about permissions
2807 rte->requiredPerms = ACL_SELECT;
2808#endif
2809
2810 rtr->rtindex = 1;
2811 jointree->fromlist = list_make1(rtr);
2812
2813 new_query->commandType = CMD_SELECT;
2814 new_query->canSetTag = true;
2815 new_query->rtable = list_make1(rte);
2816 new_query->jointree = jointree;
2817 new_query->targetList = copyObject(q->targetList);
2818
2819 if (new_query->targetList) {
2820 foreach (lc, new_query->targetList) {
2821 TargetEntry *te = (TargetEntry *)lfirst(lc);
2822 SortGroupClause *sgc = makeNode(SortGroupClause);
2823
2824 sgc->tleSortGroupRef = te->ressortgroupref = ++sortgroupref;
2825
2826 get_sort_group_operators(exprType((Node *)te->expr), false, true, false,
2827 &sgc->sortop, &sgc->eqop, NULL, &sgc->hashable);
2828
2829 new_query->groupClause = lappend(new_query->groupClause, sgc);
2830 }
2831 } else {
2832 GroupingSet *gs = makeNode(GroupingSet);
2833 gs->kind = GROUPING_SET_EMPTY;
2834 gs->content = 0;
2835 gs->location = -1;
2836 new_query->groupingSets = list_make1(gs);
2837 }
2838
2839 return new_query;
2840}
2841
2842/* -------------------------------------------------------------------------
2843 * Detection walkers
2844 * ------------------------------------------------------------------------- */
2845
2846/**
2847 * @brief Tree walker that returns true if any @c provenance() call is found.
2848 *
2849 * Used to detect whether a query explicitly calls @c provenance(), which
2850 * triggers the substitution in @c replace_provenance_function_by_expression.
2851 * @param node Current expression tree node.
2852 * @param data Pointer to @c constants_t (cast from @c void*).
2853 * @return @c true if a @c provenance() call is found anywhere in @p node.
2854 */
2855static bool provenance_function_walker(Node *node, void *data) {
2856 const constants_t *constants = (const constants_t *)data;
2857 if (node == NULL)
2858 return false;
2859
2860 if (IsA(node, FuncExpr)) {
2861 FuncExpr *f = (FuncExpr *)node;
2862
2863 if (f->funcid == constants->OID_FUNCTION_PROVENANCE)
2864 return true;
2865 }
2866
2867 return expression_tree_walker(node, provenance_function_walker, data);
2868}
2869
2870/**
2871 * @brief Check whether a @c provenance() call appears in the GROUP BY list.
2872 *
2873 * When the user writes @c GROUP BY provenance(), ProvSQL must not add its own
2874 * group-by wrapper (the query is already grouping on the token).
2875 *
2876 * @param constants Extension OID cache.
2877 * @param q Query to inspect.
2878 * @return True if any GROUP BY key contains a @c provenance() call.
2879 */
2881 Query *q) {
2882 ListCell *lc;
2883
2884 /* Build the set of ressortgrouprefs that are actually in GROUP BY
2885 * (not ORDER BY or DISTINCT, which also set ressortgroupref). */
2886 Bitmapset *group_refs = NULL;
2887 foreach (lc, q->groupClause) {
2888 SortGroupClause *sgc = (SortGroupClause *)lfirst(lc);
2889 group_refs = bms_add_member(group_refs, sgc->tleSortGroupRef);
2890 }
2891
2892 foreach (lc, q->targetList) {
2893 TargetEntry *te = (TargetEntry *)lfirst(lc);
2894 if (te->ressortgroupref > 0 &&
2895 bms_is_member(te->ressortgroupref, group_refs)) {
2896 if(expression_tree_walker((Node *)te, provenance_function_walker,
2897 (void *)constants)) {
2898 return true;
2899 }
2900
2901#if PG_VERSION_NUM >= 180000
2902 // Starting from PostgreSQL 18, the content of the GROUP BY is not
2903 // in the groupClause but in an associated RTE_GROUP RangeTblEntry
2904 if(IsA(te->expr, Var)) {
2905 Var *v = (Var *) te->expr;
2906 RangeTblEntry *r = (RangeTblEntry *)list_nth(q->rtable, v->varno - 1);
2907 if(r->rtekind == RTE_GROUP)
2908 if(expression_tree_walker((Node *) r->groupexprs, provenance_function_walker,
2909 (void *)constants)) {
2910 return true;
2911 }
2912 }
2913#endif
2914 }
2915 }
2916
2917 return false;
2918}
2919
2920/**
2921 * @brief Tree walker that detects any provenance-bearing relation or provenance() call.
2922 * @param node Current expression tree node.
2923 * @param data Pointer to @c constants_t (cast from @c void*).
2924 * @return @c true if provenance rewriting is needed for this node.
2925 */
2926/**
2927 * @brief Recursive helper for @c has_provenance_walker that detects
2928 * rv_cmp @c OpExpr and @c provenance() @c FuncExpr in
2929 * expression subtrees.
2930 *
2931 * Stops at Query boundaries: @c SubLink subselects (used as
2932 * scalar/array subqueries in expressions) are not rewritten by the
2933 * outer planner_hook pass, so a tracked relation inside one must not
2934 * cause the OUTER query's gate to engage. Only the @c testexpr of a
2935 * SubLink is followed (it lives in the outer's evaluation scope).
2936 */
2937static bool has_rv_or_provenance_call(Node *node, void *data) {
2938 const constants_t *constants = (const constants_t *)data;
2939 if (node == NULL)
2940 return false;
2941
2942 if (IsA(node, OpExpr)) {
2943 OpExpr *op = (OpExpr *)node;
2944 if (rv_cmp_index(constants, op->opfuncid) >= 0)
2945 return true;
2946 }
2947
2948 if (IsA(node, FuncExpr)) {
2949 FuncExpr *f = (FuncExpr *)node;
2950 if (f->funcid == constants->OID_FUNCTION_PROVENANCE)
2951 return true;
2952 }
2953
2954 if (IsA(node, SubLink)) {
2955 SubLink *sl = (SubLink *)node;
2956 return has_rv_or_provenance_call((Node *)sl->testexpr, data);
2957 }
2958
2959 /* Query nodes are opaque here; expression_tree_walker returns false
2960 * on them. Explicit short-circuit just makes the intent obvious. */
2961 if (IsA(node, Query))
2962 return false;
2963
2964 return expression_tree_walker(node, has_rv_or_provenance_call, data);
2965}
2966
2967static bool has_provenance_walker(Node *node, void *data) {
2968 const constants_t *constants = (const constants_t *)data;
2969 if (node == NULL)
2970 return false;
2971
2972 if (IsA(node, Query)) {
2973 Query *q = (Query *)node;
2974 ListCell *rc;
2975
2976 /* Walk into CTE subqueries explicitly: they will be inlined as
2977 * subqueries by the rewriter, so a tracked-table inside one (or
2978 * an rv_cmp / provenance() call) matters for this query. */
2979 foreach (rc, q->cteList) {
2980 CommonTableExpr *cte = (CommonTableExpr *)lfirst(rc);
2981 if (has_provenance_walker((Node *)cte->ctequery, data))
2982 return true;
2983 }
2984
2985 /* Walk this query's own expressions for rv_cmp OpExpr and
2986 * provenance() FuncExpr. Use the SubLink-aware walker so we
2987 * don't descend into expression-context subqueries (they get
2988 * planned standalone; an rv_cmp inside one matters only to
2989 * that planning pass).
2990 *
2991 * This intentionally replaces a single query_tree_walker call:
2992 * that helper recurses with the passed walker into BOTH rtable
2993 * RTEs (RTE_SUBQUERY) and SubLink subselects, which would erase
2994 * the SubLink/RTE_SUBQUERY distinction we need. */
2995 if (has_rv_or_provenance_call((Node *)q->targetList, data))
2996 return true;
2997 if (has_rv_or_provenance_call((Node *)q->jointree, data))
2998 return true;
2999 if (has_rv_or_provenance_call((Node *)q->havingQual, data))
3000 return true;
3001 if (has_rv_or_provenance_call((Node *)q->returningList, data))
3002 return true;
3003
3004 foreach (rc, q->rtable) {
3005 RangeTblEntry *r = (RangeTblEntry *)lfirst(rc);
3006 if (r->rtekind == RTE_RELATION) {
3007 ListCell *lc;
3008 AttrNumber attid = 1;
3009
3010 foreach (lc, r->eref->colnames) {
3011 const char *v = strVal(lfirst(lc));
3012
3013 if (!strcmp(v, PROVSQL_COLUMN_NAME) &&
3014 get_atttype(r->relid, attid) == constants->OID_TYPE_UUID) {
3015 return true;
3016 }
3017
3018 ++attid;
3019 }
3020 } else if (r->rtekind == RTE_FUNCTION) {
3021 ListCell *lc;
3022 AttrNumber attid = 1;
3023
3024 foreach (lc, r->functions) {
3025 RangeTblFunction *func = (RangeTblFunction *)lfirst(lc);
3026
3027 if (func->funccolcount == 1) {
3028 FuncExpr *expr = (FuncExpr *)func->funcexpr;
3029 if (expr->funcresulttype == constants->OID_TYPE_UUID &&
3030 !strcmp(get_rte_attribute_name(r, attid),
3032 return true;
3033 }
3034 }
3035
3036 attid += func->funccolcount;
3037 }
3038 } else if (r->rtekind == RTE_SUBQUERY && r->subquery != NULL) {
3039 /* A FROM-source subquery contributes its provenance to ours;
3040 * process_query recurses on it explicitly, so we must detect
3041 * tracked relations / rv_cmp / provenance() inside it. */
3042 if (has_provenance_walker((Node *)r->subquery, data))
3043 return true;
3044 }
3045 }
3046 }
3047
3048 /* For non-Query nodes, use the expression-only walker. It detects
3049 * rv_cmp OpExpr and provenance() FuncExpr inside arbitrary
3050 * sub-expressions (BoolExpr around an rv comparison, RV cmp under
3051 * IS-DISTINCT-FROM, ...) but stops at Query boundaries so a sibling
3052 * subquery's tracked rtable doesn't make THIS query's gate engage
3053 * (subqueries have their own planner_hook pass). */
3054 return has_rv_or_provenance_call(node, data);
3055}
3056
3057/**
3058 * @brief Return true if @p q involves any provenance-bearing relation or
3059 * contains an explicit @c provenance() call.
3060 *
3061 * This is the gate condition checked by @c provsql_planner before doing any
3062 * rewriting: if neither condition holds the query is passed through unchanged.
3063 *
3064 * @param constants Extension OID cache.
3065 * @param q Query to inspect.
3066 * @return True if provenance rewriting is needed.
3067 */
3068static bool has_provenance(const constants_t *constants, Query *q) {
3069 return has_provenance_walker((Node *)q, (void *)constants);
3070}
3071
3072/**
3073 * @brief Tree walker that detects any Var of type agg_token.
3074 * @param node Current expression tree node.
3075 * @param data Pointer to a @c constants_t (extension OID cache).
3076 * @return @c true if an agg_token Var is found in @p node.
3077 */
3078static bool aggtoken_walker(Node *node, void *data) {
3079 const constants_t *constants = (const constants_t *) data;
3080 if (node == NULL)
3081 return false;
3082
3083 if (IsA(node, Var)) {
3084 Var *v = (Var *) node;
3085 if(v->vartype == constants->OID_TYPE_AGG_TOKEN)
3086 return true;
3087 }
3088
3089 return expression_tree_walker(node, aggtoken_walker, data);
3090}
3091
3092/**
3093 * @brief Return true if @p node contains a @c Var of type @c agg_token.
3094 *
3095 * Used to detect whether a WHERE clause references an aggregate result
3096 * (which must be moved to HAVING).
3097 *
3098 * @param node Expression tree to inspect.
3099 * @param constants Extension OID cache.
3100 * @return True if an @c agg_token @c Var is found anywhere in @p node.
3101 */
3102static bool has_aggtoken(Node *node, const constants_t *constants) {
3103 return expression_tree_walker(node, aggtoken_walker, (void*) constants);
3104}
3105
3106/**
3107 * @brief Walker for @c needs_having_lift: detect any operand shape that
3108 * the HAVING-lift rewriter (@c having_OpExpr_to_provenance_cmp)
3109 * needs to handle specially.
3110 *
3111 * Returns @c true on:
3112 * - a @c Var of type @c agg_token; or
3113 * - a @c FuncExpr whose @c funcid is @c provenance_aggregate (the
3114 * wrapper the planner-hook puts around aggregates over tracked
3115 * non-RV columns -- yields @c agg_token).
3116 *
3117 * Anything else (deterministic scalars, plain @c Const, @c FuncExpr
3118 * over @c random_variable like @c expected / @c variance / @c moment,
3119 * comparisons of those) is left for PostgreSQL to evaluate natively;
3120 * the HAVING-lift never needs to touch it.
3121 */
3122static bool having_lift_walker(Node *node, void *data) {
3123 const constants_t *constants = (const constants_t *) data;
3124 if (node == NULL)
3125 return false;
3126
3127 if (IsA(node, Var)) {
3128 Var *v = (Var *) node;
3129 if (v->vartype == constants->OID_TYPE_AGG_TOKEN)
3130 return true;
3131 }
3132
3133 if (IsA(node, FuncExpr)) {
3134 FuncExpr *fe = (FuncExpr *) node;
3135 if (fe->funcid == constants->OID_FUNCTION_PROVENANCE_AGGREGATE)
3136 return true;
3137 }
3138
3139 return expression_tree_walker(node, having_lift_walker, data);
3140}
3141
3142/**
3143 * @brief Return true if @p havingQual contains anything the HAVING-lift
3144 * path needs to handle (an @c agg_token Var or a
3145 * @c provenance_aggregate wrapper). A qual that returns @c false
3146 * is left in place for PostgreSQL to evaluate, while the
3147 * per-group provenance still gets a @c gate_delta wrapper.
3148 *
3149 * This is what lets a HAVING like @c expected(avg(rv)) > 20 work
3150 * directly: @c provsql.avg returns @c random_variable (not
3151 * @c agg_token), @c expected collapses to a scalar @c double, and the
3152 * surrounding comparison is a plain Boolean that PostgreSQL can filter
3153 * groups by without any provenance-side rewriting.
3154 */
3155static bool needs_having_lift(Node *havingQual, const constants_t *constants) {
3156 return expression_tree_walker(havingQual, having_lift_walker,
3157 (void *) constants);
3158}
3159
3160/**
3161 * @brief Rewrite an EXCEPT query into a LEFT JOIN with monus provenance.
3162 *
3163 * EXCEPT cannot be handled directly because it deduplicates. This function
3164 * transforms:
3165 * @code
3166 * SELECT … FROM A EXCEPT SELECT … FROM B
3167 * @endcode
3168 * into a LEFT JOIN of A and B on equality of all non-provenance columns,
3169 * clears @c setOperations, and leaves the monus token combination to
3170 * @c make_provenance_expression (which will see @c SR_MONUS).
3171 *
3172 * Only simple (non-chained) EXCEPT is supported; chained EXCEPT raises an
3173 * error.
3174 *
3175 * @param constants Extension OID cache.
3176 * @param q Query to rewrite in place.
3177 * @return Always true (errors out on unsupported cases).
3178 */
3179static bool transform_except_into_join(const constants_t *constants, Query *q) {
3180 SetOperationStmt *setOps = (SetOperationStmt *)q->setOperations;
3181 RangeTblEntry *rte = makeNode(RangeTblEntry);
3182 FromExpr *fe = makeNode(FromExpr);
3183 JoinExpr *je = makeNode(JoinExpr);
3184 BoolExpr *expr = makeNode(BoolExpr);
3185 ListCell *lc;
3186 int attno = 1;
3187
3188 if (!IsA(setOps->larg, RangeTblRef) || !IsA(setOps->rarg, RangeTblRef)) {
3189 provsql_error("Unsupported chain of EXCEPT operations");
3190 }
3191
3192 expr->boolop = AND_EXPR;
3193 expr->location = -1;
3194 expr->args = NIL;
3195
3196 foreach (lc, q->targetList) {
3197 TargetEntry *te = (TargetEntry *)lfirst(lc);
3198 Var *v;
3199
3200 if (!IsA(te->expr, Var))
3201 provsql_error("EXCEPT query format not supported");
3202
3203 v = (Var *)te->expr;
3204
3205 if (v->vartype != constants->OID_TYPE_UUID) {
3206 OpExpr *oe = makeNode(OpExpr);
3207 Oid opno = find_equality_operator(v->vartype, v->vartype);
3208 Operator opInfo = SearchSysCache1(OPEROID, ObjectIdGetDatum(opno));
3209 Form_pg_operator opform;
3210 Var *leftArg, *rightArg;
3211
3212 if (!HeapTupleIsValid(opInfo))
3213 provsql_error("could not find operator with OID %u to compare variables of type %u",
3214 opno, v->vartype);
3215
3216 opform = (Form_pg_operator)GETSTRUCT(opInfo);
3217 leftArg = makeNode(Var);
3218 rightArg = makeNode(Var);
3219
3220 oe->opno = opno;
3221 oe->opfuncid = opform->oprcode;
3222 oe->opresulttype = opform->oprresult;
3223 oe->opcollid = InvalidOid;
3224 oe->inputcollid = DEFAULT_COLLATION_OID;
3225
3226 leftArg->varno = ((RangeTblRef *)setOps->larg)->rtindex;
3227 rightArg->varno = ((RangeTblRef *)setOps->rarg)->rtindex;
3228 leftArg->varattno = rightArg->varattno = attno;
3229
3230#if PG_VERSION_NUM >= 130000
3231 leftArg->varnosyn = rightArg->varnosyn = 0;
3232 leftArg->varattnosyn = rightArg->varattnosyn = 0;
3233#else
3234 leftArg->varnoold = leftArg->varno;
3235 rightArg->varnoold = rightArg->varno;
3236 leftArg->varoattno = rightArg->varoattno = attno;
3237#endif
3238
3239 leftArg->vartype = rightArg->vartype = v->vartype;
3240 leftArg->varcollid = rightArg->varcollid = InvalidOid;
3241 leftArg->vartypmod = rightArg->vartypmod = -1;
3242 leftArg->location = rightArg->location = -1;
3243
3244 oe->args = list_make2(leftArg, rightArg);
3245 oe->location = -1;
3246 expr->args = lappend(expr->args, oe);
3247
3248 ReleaseSysCache(opInfo);
3249 }
3250
3251 ++attno;
3252 }
3253
3254 /* Populate the JOIN RTE's eref / joinaliasvars / joinleftcols /
3255 * joinrightcols by walking the larg and rarg subqueries' targetLists.
3256 * Execution doesn't need these (outer Vars reference the input RTEs
3257 * directly), but PostgreSQL's ruleutils deparser walks them when
3258 * pg_get_querydef / EXPLAIN VERBOSE traverse the rewritten tree and
3259 * segfaults on NULL eref. Non-USING LEFT JOIN: joinmergedcols = 0,
3260 * output is left columns followed by right columns. */
3261 {
3262 RangeTblRef *larg_ref = (RangeTblRef *)setOps->larg;
3263 RangeTblRef *rarg_ref = (RangeTblRef *)setOps->rarg;
3264 RangeTblEntry *larg_rte =
3265 (RangeTblEntry *)list_nth(q->rtable, larg_ref->rtindex - 1);
3266 RangeTblEntry *rarg_rte =
3267 (RangeTblEntry *)list_nth(q->rtable, rarg_ref->rtindex - 1);
3268 List *aliasvars = NIL;
3269 List *leftcols = NIL;
3270 List *rightcols = NIL;
3271 List *colnames = NIL;
3272 ListCell *lc_te;
3273 int colno;
3274
3275 colno = 1;
3276 foreach (lc_te, larg_rte->subquery->targetList) {
3277 TargetEntry *te = (TargetEntry *)lfirst(lc_te);
3278 if (te->resjunk) {
3279 colno++;
3280 continue;
3281 }
3282 aliasvars = lappend(aliasvars,
3283 makeVar(larg_ref->rtindex, colno,
3284 exprType((Node *)te->expr),
3285 exprTypmod((Node *)te->expr),
3286 exprCollation((Node *)te->expr),
3287 0));
3288 leftcols = lappend_int(leftcols, colno);
3289 rightcols = lappend_int(rightcols, 0);
3290 colnames = lappend(colnames,
3291 makeString(pstrdup(te->resname ? te->resname
3292 : "?column?")));
3293 colno++;
3294 }
3295 colno = 1;
3296 foreach (lc_te, rarg_rte->subquery->targetList) {
3297 TargetEntry *te = (TargetEntry *)lfirst(lc_te);
3298 if (te->resjunk) {
3299 colno++;
3300 continue;
3301 }
3302 aliasvars = lappend(aliasvars,
3303 makeVar(rarg_ref->rtindex, colno,
3304 exprType((Node *)te->expr),
3305 exprTypmod((Node *)te->expr),
3306 exprCollation((Node *)te->expr),
3307 0));
3308 leftcols = lappend_int(leftcols, 0);
3309 rightcols = lappend_int(rightcols, colno);
3310 colnames = lappend(colnames,
3311 makeString(pstrdup(te->resname ? te->resname
3312 : "?column?")));
3313 colno++;
3314 }
3315
3316 rte->alias = NULL;
3317 rte->eref = makeAlias("unnamed_join", colnames);
3318 rte->joinaliasvars = aliasvars;
3319#if PG_VERSION_NUM >= 130000
3320 rte->joinleftcols = leftcols;
3321 rte->joinrightcols = rightcols;
3322 rte->joinmergedcols = 0;
3323#else
3324 (void) leftcols;
3325 (void) rightcols;
3326#endif
3327 }
3328
3329 rte->rtekind = RTE_JOIN;
3330 rte->jointype = JOIN_LEFT;
3331
3332 q->rtable = lappend(q->rtable, rte);
3333
3334 je->jointype = JOIN_LEFT;
3335
3336 je->larg = setOps->larg;
3337 je->rarg = setOps->rarg;
3338 je->quals = (Node *)expr;
3339 je->rtindex = list_length(q->rtable);
3340
3341 fe->fromlist = list_make1(je);
3342
3343 q->jointree = fe;
3344
3345 // TODO: Add group by in the right-side table
3346
3347 q->setOperations = 0;
3348
3349 return true;
3350}
3351
3352/**
3353 * @brief Recursively annotate a UNION tree with the provenance UUID type.
3354 *
3355 * Walks the @c SetOperationStmt tree of a UNION and appends the UUID type
3356 * to @c colTypes / @c colTypmods / @c colCollations on every node, and sets
3357 * @c all = true so that PostgreSQL does not deduplicate the combined stream.
3358 * The non-ALL deduplication has already been moved to an outer GROUP BY by
3359 * @c rewrite_non_all_into_external_group_by before this is called.
3360 *
3361 * @param constants Extension OID cache.
3362 * @param stmt Root (or subtree) of the UNION @c SetOperationStmt.
3363 * @param q Outer query (to look up subquery RTEs for agg_token type updates).
3364 */
3365static void process_set_operation_union(const constants_t *constants,
3366 SetOperationStmt *stmt,
3367 Query *q) {
3368 if (stmt->op != SETOP_UNION) {
3369 provsql_error("Unsupported mixed set operations");
3370 }
3371 if (IsA(stmt->larg, SetOperationStmt)) {
3372 process_set_operation_union(constants, (SetOperationStmt *)(stmt->larg), q);
3373 }
3374 if (IsA(stmt->rarg, SetOperationStmt)) {
3375 process_set_operation_union(constants, (SetOperationStmt *)(stmt->rarg), q);
3376 }
3377
3378 /* Update colTypes for columns that became agg_token after rewriting.
3379 * Use the left branch's subquery to detect agg_token columns. */
3380 if (IsA(stmt->larg, RangeTblRef)) {
3381 Index rtindex = ((RangeTblRef *)stmt->larg)->rtindex;
3382 RangeTblEntry *rte = list_nth_node(RangeTblEntry, q->rtable, rtindex - 1);
3383 if (rte->rtekind == RTE_SUBQUERY && rte->subquery != NULL) {
3384 ListCell *lc_type = list_head(stmt->colTypes);
3385 ListCell *lc_te = list_head(rte->subquery->targetList);
3386 while (lc_type != NULL && lc_te != NULL) {
3387 TargetEntry *te = (TargetEntry *)lfirst(lc_te);
3388 if (exprType((Node *)te->expr) == constants->OID_TYPE_AGG_TOKEN) {
3389 lfirst_oid(lc_type) = constants->OID_TYPE_AGG_TOKEN;
3390 }
3391 lc_type = my_lnext(stmt->colTypes, lc_type);
3392 lc_te = my_lnext(rte->subquery->targetList, lc_te);
3393 }
3394 }
3395 }
3396
3397 stmt->colTypes = lappend_oid(stmt->colTypes, constants->OID_TYPE_UUID);
3398 stmt->colTypmods = lappend_int(stmt->colTypmods, -1);
3399 stmt->colCollations = lappend_int(stmt->colCollations, 0);
3400 stmt->all = true;
3401}
3402
3403/**
3404 * @brief Add a WHERE condition filtering out zero-provenance tuples.
3405 *
3406 * For EXCEPT queries, tuples whose provenance evaluates to zero (i.e., the
3407 * right-hand side fully subsumes the left-hand side) must be excluded from
3408 * the result. This function appends @c provsql <> gate_zero() to
3409 * @p q->jointree->quals, ANDing with any existing WHERE condition.
3410 *
3411 * @param constants Extension OID cache.
3412 * @param q Query to modify in place.
3413 * @param provsql Provenance expression that was added to the SELECT list.
3414 */
3415static void add_select_non_zero(const constants_t *constants, Query *q,
3416 Expr *provsql) {
3417 FuncExpr *gate_zero = makeNode(FuncExpr);
3418 OpExpr *oe = makeNode(OpExpr);
3419
3420 gate_zero->funcid = constants->OID_FUNCTION_GATE_ZERO;
3421 gate_zero->funcresulttype = constants->OID_TYPE_UUID;
3422
3423 oe->opno = constants->OID_OPERATOR_NOT_EQUAL_UUID;
3424 oe->opfuncid = constants->OID_FUNCTION_NOT_EQUAL_UUID;
3425 oe->opresulttype = BOOLOID;
3426 oe->args = list_make2(provsql, gate_zero);
3427 oe->location = -1;
3428
3429 if (q->jointree->quals != NULL) {
3430 BoolExpr *be = makeNode(BoolExpr);
3431
3432 be->boolop = AND_EXPR;
3433 be->args = list_make2(oe, q->jointree->quals);
3434 be->location = -1;
3435
3436 q->jointree->quals = (Node *)be;
3437 } else
3438 q->jointree->quals = (Node *)oe;
3439}
3440
3441/**
3442 * @brief Append @p expr to @p havingQual with an AND, creating one if needed.
3443 *
3444 * If @p havingQual is NULL, returns @p expr directly. If it is already an
3445 * AND @c BoolExpr, appends to its argument list. Otherwise wraps both in a
3446 * new AND node.
3447 *
3448 * @param havingQual Existing HAVING qualifier, or NULL.
3449 * @param expr Expression to conjoin.
3450 * @return The updated HAVING qualifier.
3451 */
3452static Node *add_to_havingQual(Node *havingQual, Expr *expr)
3453{
3454 if(!havingQual) {
3455 havingQual = (Node*) expr;
3456 } else if(IsA(havingQual, BoolExpr) && ((BoolExpr*)havingQual)->boolop==AND_EXPR) {
3457 BoolExpr *be = (BoolExpr*)havingQual;
3458 be->args = lappend(be->args, expr);
3459 } else if(IsA(havingQual, OpExpr) || IsA(havingQual, BoolExpr)) {
3460 /* BoolExpr that is not an AND (OR/NOT): wrap with a new AND node. */
3461 BoolExpr *be = makeNode(BoolExpr);
3462 be->boolop=AND_EXPR;
3463 be->location=-1;
3464 be->args = list_make2(havingQual, expr);
3465 havingQual = (Node*) be;
3466 } else
3467 provsql_error("Unknown structure within Boolean expression");
3468
3469 return havingQual;
3470}
3471
3472/**
3473 * @brief Check whether @p op is a supported comparison on an aggregate result.
3474 *
3475 * Returns true iff @p op is a two-argument operator where at least one
3476 * argument is a @c Var of type @c agg_token (or an implicit-cast wrapper
3477 * thereof) and the other is a @c Const (possibly cast). This is the set
3478 * of WHERE-on-aggregate patterns that ProvSQL can safely move to a HAVING
3479 * clause.
3480 *
3481 * @param op The @c OpExpr to inspect.
3482 * @param constants Extension OID cache.
3483 * @return True if the pattern is supported, false otherwise.
3484 */
3485static bool check_selection_on_aggregate(OpExpr *op, const constants_t *constants)
3486{
3487 bool ok=true;
3488 bool found_agg_token=false;
3489
3490 if(op->args->length != 2)
3491 return false;
3492
3493 for(unsigned i=0; i<2; ++i) {
3494 Node *arg = lfirst(list_nth_cell(op->args, i));
3495
3496 // Check both arguments are either an aggtoken or a constant
3497 // (possibly after a cast)
3498 if((IsA(arg, Var) && ((Var*)arg)->vartype==constants->OID_TYPE_AGG_TOKEN)) {
3499 found_agg_token=true;
3500 } else if(IsA(arg, Const)) {
3501 } else if(IsA(arg, FuncExpr)) {
3502 FuncExpr *fe = (FuncExpr*) arg;
3503 if(fe->funcformat != COERCE_IMPLICIT_CAST && fe->funcformat != COERCE_EXPLICIT_CAST) {
3504 ok=false;
3505 break;
3506 }
3507 if(fe->args->length != 1) {
3508 ok=false;
3509 break;
3510 }
3511 if(!IsA(lfirst(list_head(fe->args)), Const)) {
3512 ok=false;
3513 break;
3514 }
3515 } else {
3516 ok=false;
3517 break;
3518 }
3519 }
3520
3521 return ok && found_agg_token;
3522}
3523
3524/**
3525 * @brief Check whether every leaf of a Boolean expression is a supported
3526 * comparison on an aggregate result.
3527 *
3528 * Recursively validates @c OpExpr leaves via @c check_selection_on_aggregate
3529 * and descends into nested @c BoolExpr nodes.
3530 *
3531 * @param be The Boolean expression to validate.
3532 * @param constants Extension OID cache.
3533 * @return True if all leaves are supported, false if any is not.
3534 */
3535static bool check_boolexpr_on_aggregate(BoolExpr *be, const constants_t *constants)
3536{
3537 ListCell *lc;
3538
3539 foreach (lc, be->args) {
3540 Node *n=lfirst(lc);
3541 if(IsA(n, OpExpr)) {
3542 if(!check_selection_on_aggregate((OpExpr*) n, constants))
3543 return false;
3544 } else if(IsA(n, BoolExpr)) {
3545 if(!check_boolexpr_on_aggregate((BoolExpr*) n, constants))
3546 return false;
3547 } else
3548 return false;
3549 }
3550
3551 return true;
3552}
3553
3554/**
3555 * @brief Top-level dispatcher for supported WHERE-on-aggregate patterns.
3556 *
3557 * @param expr Expression to validate (@c OpExpr or @c BoolExpr).
3558 * @param constants Extension OID cache.
3559 * @return True if ProvSQL can handle this expression.
3560 */
3561static bool check_expr_on_aggregate(Expr *expr, const constants_t *constants) {
3562 switch(expr->type) {
3563 case T_BoolExpr:
3564 return check_boolexpr_on_aggregate((BoolExpr*) expr, constants);
3565 case T_OpExpr:
3566 return check_selection_on_aggregate((OpExpr*) expr, constants);
3567 default:
3568 provsql_error("Unknown structure within Boolean expression");
3569 }
3570}
3571
3572/* -------------------------------------------------------------------------
3573 * Main query transformation
3574 * ------------------------------------------------------------------------- */
3575
3576/**
3577 * @brief Build the per-RTE column-numbering map used by where-provenance.
3578 *
3579 * Assigns a sequential position (1, 2, 3, …) to every non-provenance,
3580 * non-join, non-empty column across all RTEs in @p q->rtable. The
3581 * @c provsql column is assigned -1 so callers can detect provenance-tracked
3582 * RTEs. Join-RTE columns and empty-named columns (used for anonymous GROUP
3583 * BY keys) are assigned 0.
3584 *
3585 * @note For @c RTE_RELATION entries that are provenance-tracked, the
3586 * sequential numbers produced here must @b not be used as PROJECT gate
3587 * positions. Because numbering is query-order-dependent, the sequential
3588 * number for a column of a provenance table that is not the first RTE
3589 * will exceed @c nb_columns of that table's IN gate, causing
3590 * @c WhereCircuit::evaluate() to return an empty locator set. Instead,
3591 * callers should use @c varattno directly (see
3592 * @c make_provenance_expression()). The -1 sentinel is the reliable
3593 * way to identify a provenance-tracked RTE.
3594 *
3595 * @param q Query whose range table is mapped.
3596 * @param columns Pre-allocated array of length @p q->rtable->length.
3597 * Each element is allocated and filled by this function.
3598 * @param nbcols Out-param: total number of non-provenance output columns.
3599 */
3600static void build_column_map(Query *q, int **columns, int *nbcols) {
3601 unsigned i = 0;
3602 ListCell *l;
3603
3604 *nbcols = 0;
3605
3606 foreach (l, q->rtable) {
3607 RangeTblEntry *r = (RangeTblEntry *)lfirst(l);
3608 ListCell *lc;
3609
3610 columns[i] = 0;
3611 if (r->eref) {
3612 unsigned j = 0;
3613
3614 columns[i] = (int *)palloc(r->eref->colnames->length * sizeof(int));
3615
3616 foreach (lc, r->eref->colnames) {
3617 if (!lfirst(lc)) {
3618 /* Column without name — used e.g. when grouping by a discarded column */
3619 columns[i][j] = ++(*nbcols);
3620 } else {
3621 const char *v = strVal(lfirst(lc));
3622
3623 if (strcmp(v, "") && r->rtekind != RTE_JOIN) { /* join RTE columns ignored */
3624 if (!strcmp(v, PROVSQL_COLUMN_NAME))
3625 columns[i][j] = -1;
3626 else
3627 columns[i][j] = ++(*nbcols);
3628 } else {
3629 columns[i][j] = 0;
3630 }
3631 }
3632
3633 ++j;
3634 }
3635 }
3636
3637 ++i;
3638 }
3639}
3640
3641/**
3642 * @brief Categorisation of a top-level WHERE conjunct.
3643 *
3644 * Drives the unified WHERE classifier that replaced the original pair
3645 * @c migrate_aggtoken_quals_to_having + @c extract_rv_cmps_from_quals.
3646 * Both probabilistic flavours (agg_token's "moved to HAVING" world and
3647 * random_variable's "lifted to provenance" world) are special cases of
3648 * "this conjunct involves a probabilistic value the executor cannot
3649 * evaluate as a Boolean directly, so the planner has to route it to a
3650 * different evaluation site". The classifier reports which site, or
3651 * (for unsupported mixes) errors.
3652 */
3653typedef enum {
3654 QUAL_DETERMINISTIC, /**< no probabilistic value; stays in WHERE */
3655 QUAL_PURE_AGG, /**< pure agg_token expression; route to HAVING */
3656 QUAL_PURE_RV, /**< pure random_variable expression; lift to provenance */
3657 QUAL_MIXED_AGG_DET, /**< agg_token mixed with non-agg leaves; error */
3658 QUAL_MIXED_RV_DET, /**< random_variable mixed with non-RV leaves; error */
3659 QUAL_MIXED_AGG_RV /**< agg_token and random_variable in the same expr; error */
3660} qual_class;
3661
3662/**
3663 * @brief Classify @p expr along the @c qual_class axis.
3664 *
3665 * Decision table (the predicates @c has_aggtoken,
3666 * @c expr_contains_rv_cmp, @c check_expr_on_aggregate, and
3667 * @c check_expr_on_rv each return whether the expression "contains" or
3668 * "is purely" the corresponding flavour):
3669 *
3670 * | aggtoken | rv_cmp | check_agg | check_rv | classification |
3671 * |----------|--------|-----------|----------|-----------------------|
3672 * | yes | yes | - | - | QUAL_MIXED_AGG_RV |
3673 * | yes | no | true | - | QUAL_PURE_AGG |
3674 * | yes | no | false | - | QUAL_MIXED_AGG_DET |
3675 * | no | yes | - | true | QUAL_PURE_RV |
3676 * | no | yes | - | false | QUAL_MIXED_RV_DET |
3677 * | no | no | - | - | QUAL_DETERMINISTIC |
3678 */
3679static qual_class classify_qual(Expr *expr, const constants_t *constants)
3680{
3681 bool has_agg = has_aggtoken((Node *)expr, constants);
3682 bool has_rv = expr_contains_rv_cmp((Node *)expr, constants);
3683
3684 if (has_agg && has_rv)
3685 return QUAL_MIXED_AGG_RV;
3686 if (has_agg) {
3687 if (check_expr_on_aggregate(expr, constants))
3688 return QUAL_PURE_AGG;
3689 return QUAL_MIXED_AGG_DET;
3690 }
3691 if (has_rv) {
3692 if (check_expr_on_rv(expr, constants))
3693 return QUAL_PURE_RV;
3694 return QUAL_MIXED_RV_DET;
3695 }
3696 return QUAL_DETERMINISTIC;
3697}
3698
3699/** @brief Raise the user-facing error appropriate to a mixed @p c.
3700 *
3701 * Each @c provsql_error call is @c ereport(ERROR), which does not
3702 * return; the explicit @c break statements below are present only to
3703 * keep @c -Wimplicit-fallthrough happy (PostgreSQL's @c elog macro is
3704 * not marked @c noreturn for the compiler's flow analysis). */
3706{
3707 switch (c) {
3708 case QUAL_MIXED_AGG_DET:
3709 provsql_error("Complex selection on aggregation results not supported");
3710 break;
3711 case QUAL_MIXED_RV_DET:
3712 provsql_error("WHERE clause mixes random_variable comparisons with "
3713 "other predicates inside the same Boolean expression; "
3714 "split the non-RV part into its own AND conjunct");
3715 break;
3716 case QUAL_MIXED_AGG_RV:
3717 provsql_error("WHERE clause mixes agg_token (HAVING-style) and "
3718 "random_variable (per-tuple) comparisons inside the "
3719 "same Boolean expression; this combination is not "
3720 "supported");
3721 break;
3722 default:
3723 /* QUAL_DETERMINISTIC / QUAL_PURE_AGG / QUAL_PURE_RV: not a mixed case. */
3724 break;
3725 }
3726}
3727
3728/**
3729 * @brief Unified WHERE classifier &ndash; routes each top-level conjunct
3730 * to the right evaluation site in a single pass.
3731 *
3732 * Replaces and consolidates the original
3733 * @c migrate_aggtoken_quals_to_having (agg-only) and
3734 * @c extract_rv_cmps_from_quals (rv-only). The two old functions were
3735 * structurally isomorphic: each walked the WHERE clause, classified
3736 * each top-level conjunct, and routed pure-X conjuncts somewhere
3737 * semantic (HAVING vs the returned rv_cmps list); the deterministic
3738 * conjuncts stayed in WHERE. Doing it in one pass means the rare
3739 * conjunct that mixes agg_token and random_variable (which neither old
3740 * function would have caught cleanly) gets a deterministic, useful
3741 * error message.
3742 *
3743 * Supported shapes mirror the union of the two predecessors:
3744 * - Whole WHERE is a single conjunct: classify and route or error.
3745 * - Top-level AND of conjuncts: classify each, route, and (after
3746 * walking) collapse the AND if it has zero or one remaining children
3747 * so downstream code does not see a degenerate Boolean node.
3748 * - Top-level OR / NOT containing both deterministic and probabilistic
3749 * leaves: error.
3750 *
3751 * @param constants Extension OID cache.
3752 * @param q Query whose @c jointree->quals and @c havingQual
3753 * may both be mutated in place.
3754 * @return List of @c FuncExpr nodes (one per lifted RV conjunct), each
3755 * producing a @c UUID. The caller conjoins these into
3756 * @c prov_atts before @c make_provenance_expression.
3757 */
3758static List *
3759migrate_probabilistic_quals(const constants_t *constants, Query *q)
3760{
3761 List *rv_cmps = NIL;
3762 Node *quals;
3763
3764 if (!q->jointree || !q->jointree->quals)
3765 return NIL;
3766
3767 quals = q->jointree->quals;
3768
3769 /* Whole WHERE is one conjunct (single OpExpr, or non-AND BoolExpr
3770 * which we treat opaquely &ndash; the per-flavour pure checks
3771 * @c check_expr_on_aggregate / @c check_expr_on_rv recurse through
3772 * the BoolExpr structure themselves). */
3773 if (!IsA(quals, BoolExpr) || ((BoolExpr *)quals)->boolop != AND_EXPR) {
3774 qual_class c = classify_qual((Expr *)quals, constants);
3776
3777 switch (c) {
3778 case QUAL_PURE_AGG:
3779 q->havingQual = add_to_havingQual(q->havingQual, (Expr *)quals);
3780 q->jointree->quals = NULL;
3781 break;
3782 case QUAL_PURE_RV:
3783 rv_cmps = lappend(rv_cmps,
3784 rv_Expr_to_provenance((Expr *)quals,
3785 constants, false));
3786 q->jointree->quals = NULL;
3787 break;
3788 case QUAL_DETERMINISTIC:
3789 /* Leave WHERE alone. */
3790 break;
3791 default:
3792 /* Errors handled by error_for_mixed_qual. */
3793 break;
3794 }
3795 return rv_cmps;
3796 }
3797
3798 /* Top-level AND: walk conjuncts. */
3799 {
3800 BoolExpr *be = (BoolExpr *)quals;
3801 ListCell *cell, *prev;
3802
3803 for (cell = list_head(be->args), prev = NULL; cell != NULL;) {
3804 Expr *conjunct = (Expr *)lfirst(cell);
3805 qual_class c = classify_qual(conjunct, constants);
3806
3808
3809 switch (c) {
3810 case QUAL_PURE_AGG:
3811 q->havingQual = add_to_havingQual(q->havingQual, conjunct);
3812 be->args = my_list_delete_cell(be->args, cell, prev);
3813 if (prev)
3814 cell = my_lnext(be->args, prev);
3815 else
3816 cell = list_head(be->args);
3817 break;
3818 case QUAL_PURE_RV:
3819 rv_cmps = lappend(rv_cmps,
3820 rv_Expr_to_provenance(conjunct,
3821 constants, false));
3822 be->args = my_list_delete_cell(be->args, cell, prev);
3823 if (prev)
3824 cell = my_lnext(be->args, prev);
3825 else
3826 cell = list_head(be->args);
3827 break;
3828 case QUAL_DETERMINISTIC:
3829 prev = cell;
3830 cell = my_lnext(be->args, cell);
3831 break;
3832 default:
3833 /* Errors handled by error_for_mixed_qual. */
3834 break;
3835 }
3836 }
3837
3838 /* Collapse degenerate ANDs so downstream code sees a tidy WHERE. */
3839 if (be->args == NIL)
3840 q->jointree->quals = NULL;
3841 else if (list_length(be->args) == 1)
3842 q->jointree->quals = (Node *)linitial(be->args);
3843 }
3844
3845 return rv_cmps;
3846}
3847
3848/** @brief Context for the @c insert_agg_token_casts_mutator. */
3850 Query *query; ///< Outer query (to look up subquery RTEs)
3851 const constants_t *constants; ///< Extension OID cache
3853
3854/**
3855 * @brief Look up the original aggregate return type for an agg_token Var.
3856 *
3857 * Navigates from the Var's varno/varattno to the subquery's target list,
3858 * finds the provenance_aggregate() FuncExpr, and extracts the type OID
3859 * from its second argument (aggtype).
3860 */
3862 RangeTblEntry *rte;
3863 TargetEntry *te;
3864
3865 if (v->varno < 1 || v->varno > list_length(ctx->query->rtable))
3866 return InvalidOid;
3867
3868 rte = list_nth_node(RangeTblEntry, ctx->query->rtable, v->varno - 1);
3869 if (rte->rtekind != RTE_SUBQUERY || rte->subquery == NULL)
3870 return InvalidOid;
3871
3872 if (v->varattno < 1 || v->varattno > list_length(rte->subquery->targetList))
3873 return InvalidOid;
3874
3875 te = list_nth_node(TargetEntry, rte->subquery->targetList, v->varattno - 1);
3876 if (IsA(te->expr, FuncExpr)) {
3877 FuncExpr *f = (FuncExpr *)te->expr;
3878 if (f->funcid == ctx->constants->OID_FUNCTION_PROVENANCE_AGGREGATE) {
3879 Const *aggtype_const = (Const *)lsecond(f->args);
3880 return DatumGetObjectId(aggtype_const->constvalue);
3881 }
3882 }
3883 return InvalidOid;
3884}
3885
3886/**
3887 * @brief Wrap an agg_token Var in a cast to its original type, in place.
3888 */
3889static void cast_agg_token_in_list(ListCell *lc,
3891 Var *v = (Var *)lfirst(lc);
3892 Oid target = get_agg_token_orig_type(v, ctx);
3893 HeapTuple castTuple;
3894
3895 if (!OidIsValid(target))
3896 return;
3897
3898 castTuple = SearchSysCache2(CASTSOURCETARGET,
3899 ObjectIdGetDatum(ctx->constants->OID_TYPE_AGG_TOKEN),
3900 ObjectIdGetDatum(target));
3901 if (HeapTupleIsValid(castTuple)) {
3902 Form_pg_cast castForm = (Form_pg_cast)GETSTRUCT(castTuple);
3903 if (OidIsValid(castForm->castfunc)) {
3904 FuncExpr *fc = makeNode(FuncExpr);
3905 fc->funcid = castForm->castfunc;
3906 fc->funcresulttype = target;
3907 fc->funcretset = false;
3908 fc->funcvariadic = false;
3909 fc->funcformat = COERCE_IMPLICIT_CAST;
3910 fc->funccollid = InvalidOid;
3911 fc->inputcollid = InvalidOid;
3912 fc->args = list_make1(v);
3913 fc->location = -1;
3914 lfirst(lc) = fc;
3915 }
3916 ReleaseSysCache(castTuple);
3917 }
3918}
3919
3920/**
3921 * @brief Wrap any agg_token Vars in an argument list.
3922 */
3923static void cast_agg_token_args(List *args,
3925 ListCell *lc;
3926 foreach (lc, args) {
3927 if (IsA(lfirst(lc), Var) &&
3928 ((Var *)lfirst(lc))->vartype == ctx->constants->OID_TYPE_AGG_TOKEN)
3929 cast_agg_token_in_list(lc, ctx);
3930 }
3931}
3932
3933/**
3934 * @brief Insert agg_token casts for Vars used in expressions.
3935 *
3936 * After the WHERE-to-HAVING migration, agg_token Vars remaining in
3937 * expression nodes (OpExpr, WindowFunc, CoalesceExpr, MinMaxExpr, etc.)
3938 * need explicit casts to their original type so that operators and
3939 * functions receive correct values. The original type is looked up
3940 * from the provenance_aggregate() call in the subquery.
3941 */
3942static Node *
3943insert_agg_token_casts_mutator(Node *node, void *data) {
3945
3946 if (node == NULL)
3947 return NULL;
3948
3949 if (IsA(node, OpExpr)) {
3950 cast_agg_token_args(((OpExpr *)node)->args, ctx);
3951 return (Node *)node;
3952 }
3953 if (IsA(node, WindowFunc)) {
3954 cast_agg_token_args(((WindowFunc *)node)->args, ctx);
3955 return (Node *)node;
3956 }
3957 if (IsA(node, CoalesceExpr)) {
3958 cast_agg_token_args(((CoalesceExpr *)node)->args, ctx);
3959 return (Node *)node;
3960 }
3961 if (IsA(node, MinMaxExpr)) {
3962 cast_agg_token_args(((MinMaxExpr *)node)->args, ctx);
3963 return (Node *)node;
3964 }
3965 if (IsA(node, NullIfExpr)) {
3966 cast_agg_token_args(((NullIfExpr *)node)->args, ctx);
3967 return (Node *)node;
3968 }
3969
3970 return expression_tree_mutator(node, insert_agg_token_casts_mutator, data);
3971}
3972
3973/**
3974 * @brief Walk query and insert agg_token casts where needed.
3975 */
3976static void insert_agg_token_casts(const constants_t *constants, Query *q) {
3977 insert_agg_token_casts_context ctx = {q, constants};
3978 query_tree_mutator(q, insert_agg_token_casts_mutator, &ctx,
3979 QTW_DONT_COPY_QUERY | QTW_IGNORE_RC_SUBQUERIES);
3980}
3981
3982/**
3983 * @brief Wrap @p expr in a @c provsql.assume_boolean FuncExpr.
3984 *
3985 * Used by @c make_provenance_expression when its caller (the
3986 * safe-query rewrite path in @c process_query) flagged the result
3987 * as needing the @c gate_assumed_boolean structural marker.
3988 * Wrapping at expression-build time rather than at splice time
3989 * means @c add_to_select and
3990 * @c replace_provenance_function_by_expression both consume the
3991 * already-wrapped expression, so every per-row root occurrence in
3992 * the final target list -- the auto-added @c provsql column and
3993 * every substituted user-side @c provenance() call -- carries the
3994 * wrapper uniformly.
3995 *
3996 * @param constants Extension OID cache.
3997 * @param expr Provenance expression to wrap.
3998 * @return A @c FuncExpr applying @c provsql.assume_boolean to @p expr.
3999 */
4000static Expr *wrap_in_assume_boolean(const constants_t *constants,
4001 Expr *expr) {
4002 FuncExpr *wrap = makeNode(FuncExpr);
4003 wrap->funcid = constants->OID_FUNCTION_ASSUME_BOOLEAN;
4004 wrap->funcresulttype = constants->OID_TYPE_UUID;
4005 wrap->funcretset = false;
4006 wrap->funcvariadic = false;
4007 wrap->funcformat = COERCE_EXPLICIT_CALL;
4008 wrap->funccollid = InvalidOid;
4009 wrap->inputcollid = InvalidOid;
4010 wrap->args = list_make1(expr);
4011 wrap->location = -1;
4012 return (Expr *) wrap;
4013}
4014
4015/**
4016 * @brief Rewrite a single SELECT query to carry provenance.
4017 *
4018 * This is the recursive entry point for the provenance rewriter. It is
4019 * called from @c provsql_planner for top-level queries and re-entered from
4020 * @c get_provenance_attributes for subqueries in FROM.
4021 *
4022 * High-level steps:
4023 * 1. Strip any @c provsql column propagated into this query's target list.
4024 * 2. Detect and rewrite structural forms requiring pre-processing:
4025 * non-ALL set operations (wrap in outer GROUP BY), AGG DISTINCT (push
4026 * into a subquery), DISTINCT (convert to GROUP BY).
4027 * 3. Collect provenance attributes via @c get_provenance_attributes.
4028 * 4. Build a column-numbering map for where-provenance (@c build_column_map).
4029 * 5. Handle aggregates, migrate WHERE-on-aggregate to HAVING, and set ops.
4030 * 6. Build and splice the combined provenance expression.
4031 *
4032 * @param constants Extension OID cache.
4033 * @param q Query to rewrite (modified in place).
4034 * @param removed Out-param: boolean array indicating which original target
4035 * list entries were provenance columns and were removed.
4036 * May be @c NULL if the caller does not need this info.
4037 * @param wrap_root If true, mark this query's provenance expression as a
4038 * safe-query root that must be wrapped in
4039 * @c provsql.assume_boolean before splicing.
4040 * @return The (possibly restructured) rewritten query, or @c NULL if the
4041 * query has no FROM clause and can be skipped.
4042 */
4043static Query *process_query(const constants_t *constants, Query *q,
4044 bool **removed, bool wrap_root) {
4045 List *prov_atts;
4046 bool has_union = false;
4047 bool has_difference = false;
4048 bool supported = true;
4049 bool group_by_rewrite = false;
4050 int nbcols = 0;
4051 int **columns;
4052 unsigned i = 0;
4053 if (provsql_verbose >= 50)
4054 elog_node_display(NOTICE, "ProvSQL: Before query rewriting", q, true);
4055
4056 if (q->rtable == NULL) {
4057 /* FROM-less SELECT: the rest of the rewriter indexes into
4058 * q->rtable, so it can't process anything tied to a base relation.
4059 * But a WHERE-on-RV is still meaningful in this shape (e.g.
4060 * SELECT 1 WHERE normal(0,1) > 2)
4061 * since the comparison produces a pure-rv gate that's lifted into
4062 * a synthesised provsql column on the single result row. Run only
4063 * the qual migration + targetList splice and return; everything
4064 * else this function does (column mapping, set-ops, aggregation
4065 * rewriting, ...) assumes a non-empty rtable. */
4066 List *rv_cmps = migrate_probabilistic_quals(constants, q);
4067 if (rv_cmps != NIL) {
4068 Expr *provenance;
4069 RangeTblEntry *values_rte;
4070 RangeTblRef *rtr;
4071 Var *v;
4072
4073 if (list_length(rv_cmps) == 1) {
4074 provenance = (Expr *)linitial(rv_cmps);
4075 } else {
4076 /* Multiple rv conjuncts: combine via provenance_times. */
4077 FuncExpr *times = makeNode(FuncExpr);
4078 ArrayExpr *array = makeNode(ArrayExpr);
4079 times->funcid = constants->OID_FUNCTION_PROVENANCE_TIMES;
4080 times->funcresulttype = constants->OID_TYPE_UUID;
4081 times->funcvariadic = true;
4082 times->location = -1;
4083 array->array_typeid = constants->OID_TYPE_UUID_ARRAY;
4084 array->element_typeid = constants->OID_TYPE_UUID;
4085 array->elements = rv_cmps;
4086 array->location = -1;
4087 times->args = list_make1(array);
4088 provenance = (Expr *)times;
4089 }
4090
4091 /* Bind the lifted expression to a single evaluation by wrapping
4092 * it in a synthesized FROM (VALUES (<expr>)) AS _prov_(provsql).
4093 * Without this, multiple references to the same provenance
4094 * expression in the outer targetList (the user's provenance()
4095 * call, plus the auto-added provsql column) each re-invoke any
4096 * rv constructor inside, producing distinct UUIDs per call
4097 * because uniform / normal / ... mint a fresh leaf gate each
4098 * time. Wrapping in VALUES gives one evaluation site that all
4099 * outer references read from. */
4100 values_rte = makeNode(RangeTblEntry);
4101 values_rte->rtekind = RTE_VALUES;
4102 values_rte->values_lists = list_make1(list_make1(provenance));
4103 values_rte->coltypes = list_make1_oid(constants->OID_TYPE_UUID);
4104 values_rte->coltypmods = list_make1_int(-1);
4105 values_rte->colcollations = list_make1_oid(InvalidOid);
4106 values_rte->eref = makeAlias(
4107 "_prov_",
4108 list_make1(makeString(pstrdup(PROVSQL_COLUMN_NAME))));
4109 values_rte->inh = false;
4110 values_rte->inFromCl = true;
4111#if PG_VERSION_NUM < 160000
4112 values_rte->requiredPerms = 0;
4113#endif
4114 q->rtable = list_make1(values_rte);
4115
4116 rtr = makeNode(RangeTblRef);
4117 rtr->rtindex = 1;
4118 if (q->jointree == NULL) {
4119 q->jointree = makeNode(FromExpr);
4120 }
4121 q->jointree->fromlist = list_make1(rtr);
4122
4123 v = makeVar(1, 1, constants->OID_TYPE_UUID, -1, InvalidOid, 0);
4124
4125 /* Substitute any provenance() FuncExpr in the targetList with
4126 * a reference to the bound expression. */
4127 replace_provenance_function_by_expression(constants, q, (Expr *)v);
4128
4129 /* Append a provsql column reading the same Var so callers that
4130 * expect the auto-added column find it. */
4131 {
4132 TargetEntry *te = makeTargetEntry(
4133 (Expr *)copyObject(v),
4134 list_length(q->targetList) + 1,
4135 pstrdup(PROVSQL_COLUMN_NAME),
4136 false);
4137 q->targetList = lappend(q->targetList, te);
4138 }
4139 }
4140 return q;
4141 }
4142
4143 /* Inline non-recursive CTE references as subqueries so we can track
4144 * provenance through them. Must happen before set operation handling
4145 * since UNION/EXCEPT branches may reference CTEs. */
4146 inline_ctes(q);
4147
4148 {
4149 Bitmapset *removed_sortgrouprefs = NULL;
4150
4151 if (q->targetList) {
4152 removed_sortgrouprefs =
4153 remove_provenance_attributes_select(constants, q, removed);
4154 if (removed_sortgrouprefs != NULL)
4155 remove_provenance_attribute_groupref(q, removed_sortgrouprefs);
4156 if (q->setOperations)
4158 }
4159 }
4160
4161 if(provsql_active) {
4162 columns = (int **)palloc(q->rtable->length * sizeof(int *));
4163
4164 if (q->setOperations) {
4165 // TODO: Nest set operations as subqueries in FROM,
4166 // so that we only do set operations on base tables
4167
4168 SetOperationStmt *stmt = (SetOperationStmt *)q->setOperations;
4169 if (!stmt->all) {
4170 /* Check if any branch has aggregates — non-ALL set operations
4171 * on aggregate results are not supported because agg_token
4172 * lacks comparison operators for deduplication */
4173 ListCell *lc_rte;
4174 foreach (lc_rte, q->rtable) {
4175 RangeTblEntry *rte = (RangeTblEntry *)lfirst(lc_rte);
4176 if (rte->rtekind == RTE_SUBQUERY && rte->subquery &&
4177 rte->subquery->hasAggs)
4178 provsql_error("Non-ALL set operations (UNION, EXCEPT) on "
4179 "aggregate results not supported");
4180 }
4182 return process_query(constants, q, removed, wrap_root);
4183 }
4184 }
4185
4186 if (q->hasAggs) {
4187 Query *rewritten = rewrite_agg_distinct(q, constants);
4188 if (rewritten)
4189 return process_query(constants, rewritten, removed, wrap_root);
4190 }
4191
4192 /* Opt-in safe-query optimisation slot: when on, try to rewrite
4193 * hierarchical conjunctive queries to a read-once form whose
4194 * probability is computable in linear time via independent
4195 * evaluation. See try_safe_query_rewrite().
4196 *
4197 * The rewriter is gated on the presence of the assume_boolean()
4198 * helper (installed by the 1.6.0 upgrade script). Without it we
4199 * cannot wrap the per-row root in a gate_assumed_boolean, which is
4200 * what downstream evaluators inspect to refuse unsound evaluation,
4201 * so we refuse to rewrite on schemas that still predate the
4202 * helper. */
4204 OidIsValid(constants->OID_FUNCTION_ASSUME_BOOLEAN)) {
4205 Query *rewritten = try_safe_query_rewrite(constants, q);
4206 if (rewritten)
4207 return process_query(constants, rewritten, removed, true);
4208 }
4209
4210 // get_provenance_attributes will also recursively process subqueries
4211 // by calling process_query
4212 prov_atts = get_provenance_attributes(constants, q);
4213
4214 if (prov_atts == NIL) {
4215 /* If the WHERE clause contains a random_variable comparison, we
4216 * still need to take the rewriting path so the result tuple
4217 * carries the comparator's gate_cmp UUID as its provenance.
4218 * Synthesize a single gate_one() prov_att; the combination
4219 * provenance_times(one, rv_cmp) collapses to rv_cmp downstream
4220 * because gate_one is the multiplicative identity. */
4221 if (q->jointree && q->jointree->quals &&
4222 expr_contains_rv_cmp(q->jointree->quals, constants)) {
4223 FuncExpr *one_expr = makeNode(FuncExpr);
4224 one_expr->funcid = constants->OID_FUNCTION_GATE_ONE;
4225 one_expr->funcresulttype = constants->OID_TYPE_UUID;
4226 one_expr->args = NIL;
4227 one_expr->location = -1;
4228 prov_atts = list_make1(one_expr);
4229 } else {
4230 return q;
4231 }
4232 }
4233
4234 if (q->hasSubLinks) {
4235 provsql_error("Subqueries (EXISTS, IN, scalar subquery) not supported");
4236 supported = false;
4237 }
4238
4239 if (supported && q->distinctClause) {
4240 if (q->hasDistinctOn) {
4241 provsql_error("DISTINCT ON not supported");
4242 supported = false;
4243 } else if (q->hasAggs) {
4244 provsql_error("DISTINCT on aggregate results not supported");
4245 } else if (list_length(q->distinctClause) < list_length(q->targetList)) {
4246 provsql_error("Inconsistent DISTINCT and GROUP BY clauses not "
4247 "supported");
4248 supported = false;
4249 } else {
4251 }
4252 }
4253
4254 if (supported && q->setOperations) {
4255 SetOperationStmt *stmt = (SetOperationStmt *)q->setOperations;
4256
4257 if (stmt->op == SETOP_UNION) {
4258 process_set_operation_union(constants, stmt, q);
4259 has_union = true;
4260 } else if (stmt->op == SETOP_EXCEPT) {
4261 if (!transform_except_into_join(constants, q))
4262 supported = false;
4263 has_difference = true;
4264 } else {
4265 provsql_error("Set operations other than UNION and EXCEPT not "
4266 "supported");
4267 supported = false;
4268 }
4269 }
4270
4271 if (supported && q->groupClause &&
4272 !provenance_function_in_group_by(constants, q)) {
4273 group_by_rewrite = true;
4274 }
4275
4276 if (supported && q->groupingSets) {
4277 if (q->groupClause || list_length(q->groupingSets) > 1 ||
4278 ((GroupingSet *)linitial(q->groupingSets))->kind !=
4279 GROUPING_SET_EMPTY) {
4280 provsql_error("GROUPING SETS, CUBE, and ROLLUP not supported");
4281 supported = false;
4282 } else {
4283 // Simple GROUP BY ()
4284 group_by_rewrite = true;
4285 }
4286 }
4287
4288 if (supported)
4289 build_column_map(q, columns, &nbcols);
4290
4291 if (supported) {
4292 Expr *provenance;
4293 List *rv_cmps;
4294
4295 /* Single unified pass over WHERE: each top-level conjunct is
4296 * routed to the right evaluation site (HAVING for agg_token,
4297 * the returned rv_cmps list for random_variable, left in WHERE
4298 * otherwise). Mixed shapes raise a clear error. Replaces the
4299 * historical pair migrate_aggtoken_quals_to_having +
4300 * extract_rv_cmps_from_quals; see the qual_class doc above for
4301 * the routing matrix.
4302 *
4303 * Must run before replace_aggregations_by_provenance_aggregate
4304 * so the lifted RV cmps factor into each row's contribution to
4305 * any surrounding agg_token: otherwise the cmp lands at group
4306 * level with row-typed Vars the executor cannot resolve, or
4307 * gets discarded by the HAVING-replaces-result branch of
4308 * make_provenance_expression.
4309 *
4310 * Skipped for SR_PLUS / SR_MONUS (UNION / EXCEPT outer level):
4311 * each branch is rewritten by its own recursive process_query
4312 * call, so an outer-level WHERE on RV here is exotic; the
4313 * fallback after make_provenance_expression handles it. */
4314 rv_cmps = migrate_probabilistic_quals(constants, q);
4315 if (rv_cmps != NIL && !has_union && !has_difference) {
4316 prov_atts = list_concat(prov_atts, rv_cmps);
4317 rv_cmps = NIL;
4318 }
4319
4320 if (q->hasAggs) {
4321 ListCell *lc_sort;
4322
4323 // Compute aggregation expressions
4325 constants, q, prov_atts,
4326 has_union ? SR_PLUS : (has_difference ? SR_MONUS : SR_TIMES));
4327
4328 // If there are any sort clauses on something whose type is now
4329 // aggregate token, we throw an error: sorting aggregation values
4330 // when provenance is captured is ill-defined
4331 foreach (lc_sort, q->sortClause) {
4332 SortGroupClause *sort = (SortGroupClause *)lfirst(lc_sort);
4333 ListCell *lc_te;
4334 foreach (lc_te, q->targetList) {
4335 TargetEntry *te = (TargetEntry *)lfirst(lc_te);
4336 if (sort->tleSortGroupRef == te->ressortgroupref) {
4337 if (exprType((Node *)te->expr) == constants->OID_TYPE_AGG_TOKEN)
4338 provsql_error("ORDER BY on the result of an aggregate function is "
4339 "not supported");
4340 break;
4341 }
4342 }
4343 }
4344 }
4345
4346 /* Insert casts for agg_token Vars used in arithmetic or window
4347 * functions, now that WHERE-to-HAVING migration is done */
4348 insert_agg_token_casts(constants, q);
4349
4351 constants, q, prov_atts, q->hasAggs, group_by_rewrite,
4352 has_union ? SR_PLUS : (has_difference ? SR_MONUS : SR_TIMES), columns,
4353 nbcols, wrap_root);
4354
4355 /* Fallback for the rare set-op outer WHERE case: conjoin via
4356 * provenance_times after the aggregation wrappers. Correct only
4357 * when no aggregation collapses rows above this point. */
4358 if (rv_cmps != NIL) {
4359 FuncExpr *times = makeNode(FuncExpr);
4360 ArrayExpr *array = makeNode(ArrayExpr);
4361 times->funcid = constants->OID_FUNCTION_PROVENANCE_TIMES;
4362 times->funcresulttype = constants->OID_TYPE_UUID;
4363 times->funcvariadic = true;
4364 times->location = -1;
4365 array->array_typeid = constants->OID_TYPE_UUID_ARRAY;
4366 array->element_typeid = constants->OID_TYPE_UUID;
4367 array->elements = lcons(provenance, rv_cmps);
4368 array->location = -1;
4369 times->args = list_make1(array);
4370 provenance = (Expr *)times;
4371 }
4372
4375
4376 if (has_difference)
4377 add_select_non_zero(constants, q, provenance);
4378 }
4379
4380 for (i = 0; i < q->rtable->length; ++i) {
4381 if (columns[i])
4382 pfree(columns[i]);
4383 }
4384 }
4385
4386 if (provsql_verbose >= 50)
4387 elog_node_display(NOTICE, "ProvSQL: After query rewriting", q, true);
4388
4389 return q;
4390}
4391
4392/* -------------------------------------------------------------------------
4393 * INSERT ... SELECT provenance propagation
4394 * ------------------------------------------------------------------------- */
4395
4396/**
4397 * @brief Propagate provenance through INSERT ... SELECT.
4398 *
4399 * If the source SELECT involves provenance-tracked tables and the target
4400 * table has a provsql column, rewrites the source SELECT to carry
4401 * provenance and maps its provsql output to the target's provsql column,
4402 * replacing the default uuid_generate_v4().
4403 *
4404 * If the target has no provsql column, emits a warning instead.
4405 */
4406static void process_insert_select(const constants_t *constants, Query *q) {
4407 ListCell *lc;
4408 Index src_rteid = 0;
4409 RangeTblEntry *src_rte = NULL;
4410 RangeTblEntry *tgt_rte;
4411 AttrNumber provsql_attno = 0;
4412 TargetEntry *provsql_te = NULL;
4413
4414 /* Find the source SELECT subquery with provenance */
4415 foreach (lc, q->rtable) {
4416 RangeTblEntry *r = (RangeTblEntry *)lfirst(lc);
4417 ++src_rteid;
4418 if (r->rtekind == RTE_SUBQUERY && r->subquery &&
4419 has_provenance(constants, r->subquery)) {
4420 src_rte = r;
4421 break;
4422 }
4423 }
4424
4425 if (src_rte == NULL)
4426 return;
4427
4428 /* Check if the target table has a provsql column */
4429 tgt_rte = list_nth_node(RangeTblEntry, q->rtable, q->resultRelation - 1);
4430 if (tgt_rte->rtekind == RTE_RELATION) {
4431 AttrNumber attid = 1;
4432 foreach (lc, tgt_rte->eref->colnames) {
4433 if (!strcmp(strVal(lfirst(lc)), PROVSQL_COLUMN_NAME) &&
4434 get_atttype(tgt_rte->relid, attid) == constants->OID_TYPE_UUID)
4435 provsql_attno = attid;
4436 ++attid;
4437 }
4438 }
4439
4440 if (provsql_attno == 0) {
4441 provsql_warning("INSERT ... SELECT on provenance-tracked "
4442 "tables: source provenance is not propagated "
4443 "to inserted rows");
4444 return;
4445 }
4446
4447 /* Find the provsql target entry and verify it's a UUID default */
4448 foreach (lc, q->targetList) {
4449 TargetEntry *te = (TargetEntry *)lfirst(lc);
4450 if (te->resno == provsql_attno &&
4451 exprType((Node *)te->expr) == constants->OID_TYPE_UUID) {
4452 provsql_te = te;
4453 break;
4454 }
4455 }
4456
4457 if (provsql_te == NULL) {
4458 /* The target's provsql column is not in the INSERT's targetList
4459 * (no DEFAULT on the column since 1.6.0; the user did not name
4460 * the column either). Synthesise a TE here so we have something
4461 * to substitute the source provsql Var into below. */
4462 provsql_te = makeNode(TargetEntry);
4463 provsql_te->resno = provsql_attno;
4464 provsql_te->resname = pstrdup(PROVSQL_COLUMN_NAME);
4465 /* expr is set to the source Var by the substitution block below. */
4466 q->targetList = lappend(q->targetList, provsql_te);
4467 }
4468
4469 /* Rewrite the source SELECT to carry provenance */
4470 {
4471 bool *removed = NULL;
4472 Query *new_subquery = process_query(constants, src_rte->subquery, &removed, false);
4473 AttrNumber src_provsql_attno = 0;
4474
4475 if (new_subquery == NULL)
4476 return;
4477
4478 src_rte->subquery = new_subquery;
4479
4480 /* Find the provsql column in the rewritten subquery, verify its type */
4481 foreach (lc, new_subquery->targetList) {
4482 TargetEntry *te = (TargetEntry *)lfirst(lc);
4483 if (te->resname && !strcmp(te->resname, PROVSQL_COLUMN_NAME) &&
4484 exprType((Node *)te->expr) == constants->OID_TYPE_UUID) {
4485 src_provsql_attno = te->resno;
4486 break;
4487 }
4488 }
4489
4490 if (src_provsql_attno == 0)
4491 return;
4492
4493 /* Replace the target's provsql default with a Var from the source */
4494 {
4495 Var *v = makeNode(Var);
4496 v->varno = src_rteid;
4497 v->varattno = src_provsql_attno;
4498 v->vartype = constants->OID_TYPE_UUID;
4499 v->vartypmod = -1;
4500 v->varcollid = InvalidOid;
4501 v->location = -1;
4502 provsql_te->expr = (Expr *)v;
4503 }
4504
4505 /* Update the subquery RTE's column names to include provsql */
4506 src_rte->eref->colnames = lappend(src_rte->eref->colnames,
4507 makeString(pstrdup(PROVSQL_COLUMN_NAME)));
4508 }
4509}
4510
4511/* -------------------------------------------------------------------------
4512 * Planner hook & extension lifecycle
4513 * ------------------------------------------------------------------------- */
4514
4515/**
4516 * @brief PostgreSQL planner hook — entry point for provenance rewriting.
4517 *
4518 * Replaces (or chains after) the standard planner. For every CMD_SELECT
4519 * that involves at least one provenance-bearing relation or an explicit
4520 * @c provenance() call, rewrites the query via @c process_query before
4521 * handing the result to the standard planner. Non-SELECT commands and
4522 * queries without provenance are passed through unchanged.
4523 * @param q The query to plan.
4524 * @param cursorOptions Cursor options bitmask.
4525 * @param boundParams Pre-bound parameter values.
4526 * @return The planned statement.
4527 */
4528/** @brief Executor nesting depth.
4529 *
4530 * Tracks how deep we are inside @c Executor invocations. Incremented
4531 * in @c provsql_executor_start, decremented in @c provsql_executor_end.
4532 * The classifier @c NOTICE only fires when this is zero, which
4533 * corresponds to the user's outermost statement being planned (before
4534 * any executor entry). Plans built for PL/pgSQL function bodies that
4535 * the rewriter inserts -- @c provenance_times, @c provenance_plus,
4536 * @c provenance_aggregate, ... -- happen during execution of the
4537 * user's plan, so they see depth >= 1 and skip the NOTICE. */
4539
4540static PlannedStmt *provsql_planner(Query *q,
4541#if PG_VERSION_NUM >= 130000
4542 const char *query_string,
4543#endif
4544 int cursorOptions,
4545 ParamListInfo boundParams) {
4546 if (q->commandType == CMD_INSERT && q->rtable) {
4547 const constants_t constants = get_constants(false);
4548 if (constants.ok)
4549 process_insert_select(&constants, q);
4550 } else if (q->commandType == CMD_SELECT) {
4551 /* No rtable check here: a FROM-less SELECT (e.g.
4552 * SELECT 1 WHERE normal(0,1) > 2)
4553 * still needs the hook to engage when the WHERE contains an
4554 * rv_cmp. has_provenance walks the tree and returns false fast
4555 * on FROM-less queries that have neither rv_cmp nor provenance(),
4556 * so widening the gate costs nothing in the common case. */
4557 const constants_t constants = get_constants(false);
4558
4559 /* Query-time TID / BID / OPAQUE classifier. Emits a NOTICE for
4560 * the user's outermost SELECT when the GUC is on. Runs on the
4561 * user's original Query before any provsql rewriting so the
4562 * reported kind reflects the SQL the user wrote. Gating on
4563 * @c provsql_executor_depth @c == @c 0 skips the spurious extra
4564 * planning calls triggered by PL/pgSQL function bodies the
4565 * rewriter inserts (@c provenance_times, ...), whose internal
4566 * SELECTs go through the planner hook during execution of the
4567 * user's plan. */
4568 if (provsql_executor_depth == 0
4570 && q->rtable != NIL) {
4572 provsql_classify_query(q, &cls);
4574 list_free(cls.source_relids);
4575 }
4576
4577 if (constants.ok && has_provenance(&constants, q)) {
4578 bool *removed = NULL;
4579 Query *new_query;
4580 clock_t begin = 0;
4581
4582#if PG_VERSION_NUM >= 150000
4583 if (provsql_verbose >= 20)
4584 provsql_notice("Main query before query rewriting:\n%s\n",
4585 pg_get_querydef(q, true));
4586#endif
4587
4588 if (provsql_verbose >= 40)
4589 begin = clock();
4590
4591 new_query = process_query(&constants, q, &removed, false);
4592
4593 if (provsql_verbose >= 40)
4594 provsql_notice("planner time spent=%f",
4595 (double)(clock() - begin) / CLOCKS_PER_SEC);
4596
4597 if (new_query != NULL)
4598 q = new_query;
4599
4600#if PG_VERSION_NUM >= 150000
4601 if (provsql_verbose >= 20)
4602 provsql_notice("Main query after query rewriting:\n%s\n",
4603 pg_get_querydef(q, true));
4604#endif
4605 }
4606 }
4607
4608 if (prev_planner)
4609 return prev_planner(q,
4610#if PG_VERSION_NUM >= 130000
4611 query_string,
4612#endif
4613 cursorOptions, boundParams);
4614 else
4615 return standard_planner(q,
4616#if PG_VERSION_NUM >= 130000
4617 query_string,
4618#endif
4619 cursorOptions, boundParams);
4620}
4621
4622/* -------------------------------------------------------------------------
4623 * Executor hooks (depth tracking only)
4624 *
4625 * We install ExecutorStart / ExecutorEnd hooks solely to maintain
4626 * @c provsql_executor_depth, which the classifier in @c provsql_planner
4627 * consults to distinguish the user's outermost statement from nested
4628 * PL/pgSQL bodies the rewriter calls into. No other behaviour changes.
4629 * ------------------------------------------------------------------------- */
4630static ExecutorStart_hook_type prev_ExecutorStart = NULL;
4631static ExecutorEnd_hook_type prev_ExecutorEnd = NULL;
4632
4633static void provsql_executor_start(QueryDesc *queryDesc, int eflags) {
4635 PG_TRY();
4636 {
4638 prev_ExecutorStart(queryDesc, eflags);
4639 else
4640 standard_ExecutorStart(queryDesc, eflags);
4641 }
4642 PG_CATCH();
4643 {
4645 PG_RE_THROW();
4646 }
4647 PG_END_TRY();
4648}
4649
4650static void provsql_executor_end(QueryDesc *queryDesc) {
4651#if PG_VERSION_NUM >= 130000
4652 PG_TRY();
4653 {
4654 if (prev_ExecutorEnd)
4655 prev_ExecutorEnd(queryDesc);
4656 else
4657 standard_ExecutorEnd(queryDesc);
4658 }
4659 PG_FINALLY();
4660 {
4662 }
4663 PG_END_TRY();
4664#else
4665 /* PG < 13 lacks PG_FINALLY: emulate by running the cleanup on the
4666 * error path (via PG_CATCH + PG_RE_THROW) and on the success path
4667 * (after PG_END_TRY). Functionally equivalent. */
4668 PG_TRY();
4669 {
4670 if (prev_ExecutorEnd)
4671 prev_ExecutorEnd(queryDesc);
4672 else
4673 standard_ExecutorEnd(queryDesc);
4674 }
4675 PG_CATCH();
4676 {
4678 PG_RE_THROW();
4679 }
4680 PG_END_TRY();
4682#endif
4683}
4684
4685/* -------------------------------------------------------------------------
4686 * ProcessUtility hook: CTAS lineage inheritance.
4687 *
4688 * When a @c CREATE @c TABLE @c AS (or @c CREATE @c MATERIALIZED @c VIEW,
4689 * or @c SELECT @c INTO -- PG's parser transforms all three into
4690 * @c CreateTableAsStmt) projects a @c provsql column lifted verbatim
4691 * from a tracked source, the resulting relation's atoms are not freshly
4692 * minted UUIDs but lineage tokens of one or more base @c
4693 * add_provenance / @c repair_key relations. The hook intercepts the
4694 * utility statement, classifies the inner @c SELECT via
4695 * @c provsql_classify_query, lets PG run the CTAS, then populates
4696 * @c provsql_table_info (with the inherited @c kind / BID @c block_key)
4697 * and the ancestor registry (with the transitive union of source
4698 * ancestor sets) on the just-created relation. A @c provenance_guard
4699 * trigger is installed on the new table so any subsequent INSERT /
4700 * UPDATE that supplies a non-NULL @c provsql still flips the table to
4701 * OPAQUE the standard way.
4702 *
4703 * The hook deliberately fires only when the inner @c SELECT projects
4704 * a @c provsql column from a tracked source -- otherwise the new
4705 * relation has no @c provsql column and the lineage metadata would be
4706 * operationally pointless. Users who want a tracked CTAS-derived
4707 * table without inherited lineage still call @c add_provenance on it
4708 * afterwards (that path seeds @c {self} and overrides whatever this
4709 * hook may have recorded).
4710 * ------------------------------------------------------------------------- */
4711
4712static ProcessUtility_hook_type prev_ProcessUtility = NULL;
4713
4714/** @brief State captured by the pre-execution pass for the post-execution one. */
4715typedef struct ProvSQLCtasCapture {
4716 bool fire; ///< true when the post-pass should run
4717 Query *inner_query; ///< cloned for safety; freed by pfree on completion
4721 Oid source_relid; ///< Single source whose block_key we want to align (BID only)
4725
4726/**
4727 * @brief Decide whether @p parsetree is a CTAS that should trigger
4728 * the ancestry hook, and if so populate @p cap with the inner
4729 * classification, the (single) source's block-key columns, and
4730 * the transitive ancestor union.
4731 *
4732 * Fires only when the inner @c SELECT's target list projects a base-
4733 * level @c Var (possibly through @c RelabelType wrappers) that
4734 * resolves to the @c provsql column of an @c RTE_RELATION whose
4735 * metadata is non-OPAQUE. Anything else (no @c provsql in the
4736 * projection, classifier says OPAQUE, the projected source is itself
4737 * OPAQUE) leaves @c cap->fire false and the post-pass becomes a
4738 * no-op.
4739 */
4740static void provsql_ProcessUtility_capture(Node *parsetree,
4741 ProvSQLCtasCapture *cap) {
4742 CreateTableAsStmt *stmt;
4743 Query *qry;
4745 ListCell *lc;
4746 AttrNumber prov_resno = InvalidAttrNumber;
4747 Oid source_relid = InvalidOid;
4748 ProvenanceTableInfo source_info;
4749 Bitmapset *ancestor_bms = NULL;
4750 int bms_member;
4751 uint16 ancestor_n;
4752
4753 cap->fire = false;
4754 if (!provsql_active)
4755 return;
4756 if (parsetree == NULL || !IsA(parsetree, CreateTableAsStmt))
4757 return;
4758 stmt = (CreateTableAsStmt *) parsetree;
4759 if (stmt->query == NULL || !IsA(stmt->query, Query))
4760 return;
4761 qry = (Query *) stmt->query;
4762 if (qry->commandType != CMD_SELECT)
4763 return;
4764
4765 provsql_classify_query(qry, &cls);
4766 if (cls.kind == PROVSQL_TABLE_OPAQUE) {
4767 list_free(cls.source_relids);
4768 return;
4769 }
4770
4771 /* Walk the inner target list for a TLE whose Var resolves to the
4772 * provsql column of a tracked, non-OPAQUE source. First match wins
4773 * (CTAS preserves the TLE's column name verbatim in the new
4774 * table, so a single provsql TLE is the normal case). */
4775 foreach (lc, qry->targetList) {
4776 TargetEntry *te = (TargetEntry *) lfirst(lc);
4777 Node *e = (Node *) te->expr;
4778 Var *v;
4779 RangeTblEntry *rte;
4780 AttrNumber prov_attno;
4781
4782 if (te->resjunk)
4783 continue;
4784 while (e != NULL && IsA(e, RelabelType))
4785 e = (Node *) ((RelabelType *) e)->arg;
4786 if (e == NULL || !IsA(e, Var))
4787 continue;
4788 v = (Var *) e;
4789 if (v->varlevelsup != 0)
4790 continue;
4791 if (v->varno < 1 || (int) v->varno > list_length(qry->rtable))
4792 continue;
4793 rte = (RangeTblEntry *) list_nth(qry->rtable, v->varno - 1);
4794 if (rte->rtekind != RTE_RELATION)
4795 continue;
4796 prov_attno = get_attnum(rte->relid, PROVSQL_COLUMN_NAME);
4797 if (prov_attno == InvalidAttrNumber || v->varattno != prov_attno)
4798 continue;
4799 if (!provsql_lookup_table_info(rte->relid, &source_info))
4800 continue;
4801 if (source_info.kind == PROVSQL_TABLE_OPAQUE)
4802 continue;
4803 prov_resno = te->resno;
4804 source_relid = rte->relid;
4805 break;
4806 }
4807 if (prov_resno == InvalidAttrNumber) {
4808 list_free(cls.source_relids);
4809 return;
4810 }
4811
4812 /* Transitive ancestor union: lookup each classifier-reported source's
4813 * registered ancestry, fall back to {source} when none recorded
4814 * (defensive: the SQL add_provenance / repair_key seed should always
4815 * give us a non-empty set). Bitmapset dedupes; we then walk it in
4816 * ascending order to get the sorted Oid array the registry stores. */
4817 foreach (lc, cls.source_relids) {
4818 Oid src_relid = lfirst_oid(lc);
4819 uint16 src_n;
4820 Oid src_ancestors[PROVSQL_TABLE_INFO_MAX_ANCESTORS];
4821 if (provsql_lookup_ancestry(src_relid, &src_n, src_ancestors)) {
4822 for (uint16 i = 0; i < src_n; ++i)
4823 ancestor_bms = bms_add_member(ancestor_bms, (int) src_ancestors[i]);
4824 } else {
4825 ancestor_bms = bms_add_member(ancestor_bms, (int) src_relid);
4826 }
4827 }
4828 list_free(cls.source_relids);
4829
4830 ancestor_n = 0;
4831 bms_member = -1;
4832 while ((bms_member = bms_next_member(ancestor_bms, bms_member)) >= 0) {
4833 if (ancestor_n >= PROVSQL_TABLE_INFO_MAX_ANCESTORS) {
4834 /* Cap exceeded: refuse to fire rather than truncate the ancestor
4835 * set silently (a partial set would let the safe-query
4836 * disjointness check accept a join that shouldn't be safe). */
4837 bms_free(ancestor_bms);
4838 return;
4839 }
4840 cap->ancestors[ancestor_n++] = (Oid) bms_member;
4841 }
4842 bms_free(ancestor_bms);
4843
4844 cap->fire = true;
4845 cap->inner_query = qry;
4847 cap->ancestor_n = ancestor_n;
4848 cap->source_relid = source_relid;
4849 cap->source_block_key_n = source_info.block_key_n;
4850 memcpy(cap->source_block_key, source_info.block_key,
4851 source_info.block_key_n * sizeof(AttrNumber));
4852}
4853
4854/** @brief Map @c provsql_table_kind to its textual label
4855 * (@c set_table_info accepts text). */
4857 switch (k) {
4858 case PROVSQL_TABLE_TID: return "tid";
4859 case PROVSQL_TABLE_BID: return "bid";
4860 case PROVSQL_TABLE_OPAQUE: return "opaque";
4861 }
4862 return "opaque";
4863}
4864
4865/** @brief Forward declaration of the C SQL entry points. */
4866extern Datum set_table_info(PG_FUNCTION_ARGS);
4867extern Datum set_ancestors(PG_FUNCTION_ARGS);
4868
4869/**
4870 * @brief Apply @p cap to the freshly-created relation @c stmt->into->rel.
4871 *
4872 * For BID sources: walks the inner query's target list to align each
4873 * source block-key column to its output @c resno. If any block-key
4874 * column is missing from the projection (the CTAS dropped it), the
4875 * new relation cannot honour the BID invariant under that column --
4876 * the hook demotes to TID rather than asserting a now-stale block
4877 * key.
4878 *
4879 * Installs @c provenance_guard via SPI so subsequent INSERT /
4880 * UPDATE OF provsql on the new relation flip its kind to OPAQUE
4881 * through the standard guard path.
4882 */
4883static void provsql_ProcessUtility_apply(Node *parsetree,
4884 ProvSQLCtasCapture *cap) {
4885 CreateTableAsStmt *stmt;
4886 Oid new_relid;
4887 AttrNumber prov_attno;
4888 provsql_table_kind eff_kind;
4889 uint16 eff_block_key_n = 0;
4890 AttrNumber eff_block_key[PROVSQL_TABLE_INFO_MAX_BLOCK_KEY];
4891 Datum kind_datum;
4892 Datum block_key_datum;
4893 Datum ancestors_datum;
4894 Datum *block_key_elems;
4895 Datum *ancestor_elems;
4896 ArrayType *block_key_arr;
4897 ArrayType *ancestors_arr;
4898 const char *nspname;
4899 const char *relname;
4900 StringInfoData trigger_sql;
4901
4902 if (!cap->fire)
4903 return;
4904 stmt = (CreateTableAsStmt *) parsetree;
4905
4906 new_relid = RangeVarGetRelid(stmt->into->rel, NoLock, true);
4907 if (new_relid == InvalidOid)
4908 return;
4909
4910 /* Confirm the new relation actually has a @c provsql @c uuid column.
4911 * CTAS preserves TLE column names, so this is essentially the
4912 * post-execution verification of what the pre-pass already
4913 * required. */
4914 prov_attno = get_attnum(new_relid, PROVSQL_COLUMN_NAME);
4915 if (prov_attno == InvalidAttrNumber)
4916 return;
4917 if (get_atttype(new_relid, prov_attno) != UUIDOID)
4918 return;
4919
4920 /* BID block-key alignment: each source block-key column must
4921 * survive in the inner-query target list. When all do, the new
4922 * relation's effective block key is the corresponding output
4923 * resno. When any is missing, demote to TID (a partial block
4924 * key would falsely advertise mutual exclusion the rows no
4925 * longer have). */
4926 eff_kind = cap->inherited_kind;
4927 if (eff_kind == PROVSQL_TABLE_BID) {
4928 bool ok = true;
4929 for (uint16 i = 0; i < cap->source_block_key_n; ++i) {
4930 AttrNumber src_attno = cap->source_block_key[i];
4931 ListCell *lc;
4932 bool found = false;
4933 foreach (lc, cap->inner_query->targetList) {
4934 TargetEntry *te = (TargetEntry *) lfirst(lc);
4935 Node *e = (Node *) te->expr;
4936 Var *v;
4937 RangeTblEntry *rte;
4938 if (te->resjunk)
4939 continue;
4940 while (e != NULL && IsA(e, RelabelType))
4941 e = (Node *) ((RelabelType *) e)->arg;
4942 if (e == NULL || !IsA(e, Var))
4943 continue;
4944 v = (Var *) e;
4945 if (v->varlevelsup != 0)
4946 continue;
4947 if (v->varno < 1
4948 || (int) v->varno > list_length(cap->inner_query->rtable))
4949 continue;
4950 rte = (RangeTblEntry *)
4951 list_nth(cap->inner_query->rtable, v->varno - 1);
4952 if (rte->rtekind != RTE_RELATION)
4953 continue;
4954 if (rte->relid == cap->source_relid
4955 && v->varattno == src_attno) {
4956 if (eff_block_key_n >= PROVSQL_TABLE_INFO_MAX_BLOCK_KEY) {
4957 ok = false;
4958 break;
4959 }
4960 eff_block_key[eff_block_key_n++] = te->resno;
4961 found = true;
4962 break;
4963 }
4964 }
4965 if (!found) {
4966 ok = false;
4967 break;
4968 }
4969 }
4970 if (!ok) {
4971 eff_kind = PROVSQL_TABLE_TID;
4972 eff_block_key_n = 0;
4973 }
4974 }
4975
4976 /* Marshal arguments and invoke the SQL-level helpers via
4977 * DirectFunctionCall: this reaches the worker through the same IPC
4978 * path the user-facing SQL functions use, including the relcache
4979 * invalidation broadcast on the way out. */
4980 kind_datum = CStringGetTextDatum(provsql_ctas_kind_label(eff_kind));
4981 if (eff_block_key_n == 0) {
4982 block_key_arr = construct_empty_array(INT2OID);
4983 } else {
4984 block_key_elems = palloc(eff_block_key_n * sizeof(Datum));
4985 for (uint16 i = 0; i < eff_block_key_n; ++i)
4986 block_key_elems[i] = Int16GetDatum(eff_block_key[i]);
4987 block_key_arr = construct_array(block_key_elems, eff_block_key_n,
4988 INT2OID, 2, true, 's');
4989 pfree(block_key_elems);
4990 }
4991 block_key_datum = PointerGetDatum(block_key_arr);
4992 DirectFunctionCall3(set_table_info,
4993 ObjectIdGetDatum(new_relid),
4994 kind_datum,
4995 block_key_datum);
4996
4997 if (cap->ancestor_n == 0) {
4998 ancestors_arr = construct_empty_array(OIDOID);
4999 } else {
5000 ancestor_elems = palloc(cap->ancestor_n * sizeof(Datum));
5001 for (uint16 i = 0; i < cap->ancestor_n; ++i)
5002 ancestor_elems[i] = ObjectIdGetDatum(cap->ancestors[i]);
5003 ancestors_arr = construct_array(ancestor_elems, cap->ancestor_n,
5004 OIDOID, sizeof(Oid), true, 'i');
5005 pfree(ancestor_elems);
5006 }
5007 ancestors_datum = PointerGetDatum(ancestors_arr);
5008 DirectFunctionCall2(set_ancestors,
5009 ObjectIdGetDatum(new_relid),
5010 ancestors_datum);
5011
5012 /* Install the provenance_guard trigger via SPI. Users who later
5013 * INSERT / UPDATE OF provsql with a non-NULL value will then
5014 * trigger the standard kind flip to OPAQUE; users who omit the
5015 * column on INSERT get a fresh @c uuid_generate_v4 leaf (which
5016 * already disconnects the row from the inherited lineage, but the
5017 * guard prevents the more dangerous shared-UUID aliasing path).
5018 *
5019 * Materialized views are exempt: PG forbids triggers on them, and
5020 * they cannot be modified through DML anyway (only @c REFRESH @c
5021 * MATERIALIZED @c VIEW changes the contents -- which re-runs the
5022 * inner SELECT and the freshly-projected rows continue to carry
5023 * lineage from the same sources). */
5024 /* PG 14 renamed CreateTableAsStmt.relkind -> objtype (same ObjectType,
5025 * same OBJECT_* values; pure field rename). */
5026#if PG_VERSION_NUM >= 140000
5027 if (stmt->objtype == OBJECT_MATVIEW)
5028#else
5029 if (stmt->relkind == OBJECT_MATVIEW)
5030#endif
5031 return;
5032
5033 nspname = get_namespace_name(get_rel_namespace(new_relid));
5034 relname = get_rel_name(new_relid);
5035 if (nspname == NULL || relname == NULL)
5036 return;
5037 initStringInfo(&trigger_sql);
5038 appendStringInfo(&trigger_sql,
5039 "CREATE TRIGGER provenance_guard "
5040 "BEFORE INSERT OR UPDATE OF provsql ON %s.%s "
5041 /* "EXECUTE PROCEDURE" is the legacy form, kept as a valid synonym
5042 * of "EXECUTE FUNCTION" through PG 18 -- matches the rest of the
5043 * codebase and stays PG 10-compatible. Promote when PG 10 drops
5044 * out of the support floor. */
5045 "FOR EACH ROW EXECUTE PROCEDURE provsql.provenance_guard()",
5046 quote_identifier(nspname), quote_identifier(relname));
5047 if (SPI_connect() != SPI_OK_CONNECT)
5048 provsql_error("CTAS lineage hook: SPI_connect failed");
5049 if (SPI_exec(trigger_sql.data, 0) != SPI_OK_UTILITY)
5050 provsql_error("CTAS lineage hook: failed to install provenance_guard "
5051 "on %s.%s", nspname, relname);
5052 SPI_finish();
5053 pfree(trigger_sql.data);
5054}
5055
5057 PlannedStmt *pstmt,
5058 const char *queryString,
5059#if PG_VERSION_NUM >= 140000
5060 bool readOnlyTree,
5061#endif
5062 ProcessUtilityContext context,
5063 ParamListInfo params,
5064 QueryEnvironment *queryEnv,
5065 DestReceiver *dest,
5066#if PG_VERSION_NUM >= 130000
5067 QueryCompletion *qc
5068#else
5069 char *completionTag
5070#endif
5071 ) {
5072 Node *parsetree = pstmt ? pstmt->utilityStmt : NULL;
5073 ProvSQLCtasCapture cap = {0};
5074
5075 provsql_ProcessUtility_capture(parsetree, &cap);
5076
5078 prev_ProcessUtility(pstmt, queryString,
5079#if PG_VERSION_NUM >= 140000
5080 readOnlyTree,
5081#endif
5082 context, params, queryEnv, dest,
5083#if PG_VERSION_NUM >= 130000
5084 qc
5085#else
5086 completionTag
5087#endif
5088 );
5089 else
5090 standard_ProcessUtility(pstmt, queryString,
5091#if PG_VERSION_NUM >= 140000
5092 readOnlyTree,
5093#endif
5094 context, params, queryEnv, dest,
5095#if PG_VERSION_NUM >= 130000
5096 qc
5097#else
5098 completionTag
5099#endif
5100 );
5101
5102 provsql_ProcessUtility_apply(parsetree, &cap);
5103}
5104
5105/**
5106 * @brief Extension initialization – called once when the shared library is loaded.
5107 *
5108 * Registers the GUC variables (@c provsql.active, @c where_provenance,
5109 * @c update_provenance, @c verbose_level, @c aggtoken_text_as_uuid,
5110 * @c tool_search_path), installs the planner hook and shared-memory hooks,
5111 * and launches the background MMap worker.
5112 *
5113 * Must be loaded via @c shared_preload_libraries; raises an error otherwise.
5114 */
5115void _PG_init(void) {
5116 if (!process_shared_preload_libraries_in_progress)
5117 provsql_error("provsql needs to be added to the shared_preload_libraries "
5118 "configuration variable");
5119
5120 DefineCustomBoolVariable("provsql.active",
5121 "Should ProvSQL track provenance?",
5122 "1 is standard ProvSQL behavior, 0 means provsql attributes will be dropped.",
5124 true,
5125 PGC_USERSET,
5126 0,
5127 NULL,
5128 NULL,
5129 NULL);
5130 DefineCustomBoolVariable("provsql.where_provenance",
5131 "Should ProvSQL track where-provenance?",
5132 "1 turns where-provenance on, 0 off.",
5134 false,
5135 PGC_USERSET,
5136 0,
5137 NULL,
5138 NULL,
5139 NULL);
5140 DefineCustomBoolVariable("provsql.update_provenance",
5141 "Should ProvSQL track update provenance?",
5142 "1 turns update provenance on, 0 off.",
5144 false,
5145 PGC_USERSET,
5146 0,
5147 NULL,
5148 NULL,
5149 NULL);
5150 DefineCustomBoolVariable("provsql.aggtoken_text_as_uuid",
5151 "Output agg_token cells as the underlying UUID "
5152 "instead of \"value (*)\".",
5153 "Off by default for psql-friendly output. UI "
5154 "layers (notably ProvSQL Studio) flip this on "
5155 "per session so aggregate cells expose the "
5156 "circuit root UUID for click-through; the "
5157 "display value is recovered via "
5158 "provsql.agg_token_value_text(uuid).",
5160 false,
5161 PGC_USERSET,
5162 0,
5163 NULL,
5164 NULL,
5165 NULL);
5166 DefineCustomIntVariable("provsql.verbose_level",
5167 "Level of verbosity for ProvSQL informational and debug messages",
5168 "0 for quiet (default), 1-9 for informational messages, 10-100 for debug information.",
5170 0,
5171 0,
5172 100,
5173 PGC_USERSET,
5174 1,
5175 NULL,
5176 NULL,
5177 NULL);
5178 DefineCustomStringVariable("provsql.tool_search_path",
5179 "Directories prepended to PATH when ProvSQL spawns external tools.",
5180 "Colon-separated list of directories searched before the server's PATH "
5181 "when locating d4, c2d, minic2d, dsharp, weightmc, or graph-easy. "
5182 "Empty (default) means rely on the server's PATH alone.",
5184 "",
5185 PGC_USERSET,
5186 0,
5187 NULL,
5188 NULL,
5189 NULL);
5190 DefineCustomBoolVariable("provsql.simplify_on_load",
5191 "Apply universal cmp-resolution passes when "
5192 "loading a provenance circuit.",
5193 "When on (default), every GenericCircuit returned "
5194 "by getGenericCircuit goes through RangeCheck "
5195 "(and any future universal pass): comparators "
5196 "decidable to certain Boolean values become "
5197 "Bernoulli gate_input gates with probability 0 "
5198 "or 1, transparent to every downstream consumer "
5199 "(semiring evaluators, MC, view_circuit, PROV "
5200 "export). Set off to inspect raw circuit "
5201 "structure (e.g. when debugging gate-creation "
5202 "paths).",
5204 true,
5205 PGC_USERSET,
5206 0,
5207 NULL,
5208 NULL,
5209 NULL);
5210 /* Debug-only: hidden from SHOW ALL and postgresql.conf.sample.
5211 * On is strictly better for end users (analytic answers where
5212 * possible, lower MC variance, more methods usable on continuous
5213 * circuits); off only serves developer A/B against pure MC and as
5214 * a bisection escape valve if a closure rule misbehaves. */
5215 DefineCustomBoolVariable("provsql.hybrid_evaluation",
5216 "Run the hybrid-evaluator simplifier and "
5217 "island decomposer inside probability_evaluate. "
5218 "Debug only.",
5219 "When on (default), probability_evaluate runs "
5220 "the HybridEvaluator peephole simplifier "
5221 "between RangeCheck and AnalyticEvaluator and "
5222 "the per-cmp MC island decomposer after "
5223 "AnalyticEvaluator. Off bypasses both and lets "
5224 "unresolved comparators fall through to "
5225 "whole-circuit MC. End users have no reason "
5226 "to flip this; it exists for developer A/B "
5227 "testing against the unfolded path and as a "
5228 "bisection knob if a closure rule turns out "
5229 "to be unsound on some workload.",
5231 true,
5232 PGC_USERSET,
5233 GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE,
5234 NULL,
5235 NULL,
5236 NULL);
5237 /* Debug-only: hidden from SHOW ALL and postgresql.conf.sample.
5238 * Umbrella for closed-form / analytic resolution of gate_cmp
5239 * probabilities in probability_evaluate (currently the
5240 * Poisson-binomial HAVING-COUNT pre-pass; future MIN / MAX / SUM
5241 * pre-passes gate on the same flag). On is strictly better for
5242 * end users (each resolver replaces an exponential DNF
5243 * construction with O(N) or O(N x C) arithmetic); off only serves
5244 * developer A/B against the unoptimised enumerate_valid_worlds
5245 * path and as a bisection escape valve. */
5246 DefineCustomBoolVariable("provsql.cmp_probability_evaluation",
5247 "Run closed-form / analytic probability "
5248 "evaluators for gate_cmps inside "
5249 "probability_evaluate. Debug only.",
5250 "When on (default), probability_evaluate "
5251 "runs pre-passes that recognise specific "
5252 "gate_cmp shapes (currently HAVING COUNT(*) "
5253 "op C over distinct gate_input leaves) and "
5254 "replace each cmp with a Bernoulli "
5255 "gate_input carrying the closed-form "
5256 "probability, bypassing the DNF that "
5257 "provsql_having's enumerate_valid_worlds "
5258 "would otherwise emit. Off forces the cmp "
5259 "to fall through to that enumeration path. "
5260 "Future MIN / MAX / SUM probability "
5261 "evaluators will gate on the same flag. "
5262 "End users have no reason to flip this; it "
5263 "exists for developer A/B testing and as a "
5264 "bisection escape valve.",
5266 true,
5267 PGC_USERSET,
5268 GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE,
5269 NULL,
5270 NULL,
5271 NULL);
5272 DefineCustomBoolVariable("provsql.boolean_provenance",
5273 "Opt-in safe-query optimisation for hierarchical conjunctive queries.",
5274 "When on, the planner rewrites self-join-free "
5275 "hierarchical conjunctive queries (and independent "
5276 "UCQs) over TID/BID tables to a read-once form "
5277 "whose probability is computable in linear time "
5278 "via the existing BooleanCircuit independent "
5279 "evaluator. The rewrite preserves the Boolean "
5280 "polynomial of the existential lineage, so any "
5281 "evaluation that factors through a homomorphism "
5282 "from Boolean functions remains sound (notably "
5283 "probability and Shapley / Banzhaf). The "
5284 "resulting root gate is tagged; semiring "
5285 "evaluators are individually marked at compile "
5286 "time as compatible or not with this rewrite, "
5287 "and incompatible evaluators refuse to run on a "
5288 "tagged circuit. Off by default because the "
5289 "rewrite changes the multiset of result rows and "
5290 "is therefore unsound for per-row provenance "
5291 "interrogations and aggregation queries.",
5293 false,
5294 PGC_USERSET,
5295 0,
5296 NULL,
5297 NULL,
5298 NULL);
5299 DefineCustomBoolVariable("provsql.classify_top_level",
5300 "Emit a NOTICE classifying each top-level SELECT.",
5301 "When on, every top-level SELECT that "
5302 "touches a relation triggers a NOTICE of "
5303 "the form `ProvSQL: query result is "
5304 "<KIND> (sources: ...)` where <KIND> is "
5305 "TID, BID, or OPAQUE under the existing "
5306 "provsql_table_kind taxonomy and the "
5307 "sources list names the provenance-"
5308 "tracked base relations the query touches. "
5309 "Read-only : the classifier does not "
5310 "rewrite the query. Studio reads the "
5311 "NOTICE to label query results with their "
5312 "certified kind.",
5314 false,
5315 PGC_USERSET,
5316 0,
5317 NULL,
5318 NULL,
5319 NULL);
5320 DefineCustomIntVariable("provsql.monte_carlo_seed",
5321 "Seed for the Monte Carlo sampler.",
5322 "-1 (default) seeds from std::random_device for "
5323 "non-deterministic sampling. Any other value "
5324 "(including 0) is used as a literal seed for "
5325 "std::mt19937_64, making "
5326 "probability_evaluate(..., 'monte-carlo', n) "
5327 "reproducible across runs and across the Bernoulli "
5328 "and continuous (gate_rv) sampling paths.",
5330 -1,
5331 -1,
5332 INT_MAX,
5333 PGC_USERSET,
5334 0,
5335 NULL,
5336 NULL,
5337 NULL);
5338 DefineCustomIntVariable("provsql.rv_mc_samples",
5339 "Default sample count for analytical-evaluator MC fallbacks.",
5340 "Used when an analytical evaluator (Expectation, "
5341 "future hybrid evaluator, etc.) cannot decompose a "
5342 "sub-circuit and needs to fall back to Monte Carlo. "
5343 "Default 10000. Set to 0 to disable the fallback "
5344 "entirely: callers raise an exception rather than "
5345 "sampling, which is useful when only analytical "
5346 "answers are acceptable. Unrelated to "
5347 "probability_evaluate(..., 'monte-carlo', n) where "
5348 "the sample count is an explicit argument.",
5350 10000,
5351 0,
5352 INT_MAX,
5353 PGC_USERSET,
5354 0,
5355 NULL,
5356 NULL,
5357 NULL);
5358
5359 // Emit warnings for undeclared provsql.* configuration parameters
5360 EmitWarningsOnPlaceholders("provsql");
5361
5362 prev_planner = planner_hook;
5363 prev_shmem_startup = shmem_startup_hook;
5364 prev_ExecutorStart = ExecutorStart_hook;
5365 prev_ExecutorEnd = ExecutorEnd_hook;
5366 prev_ProcessUtility = ProcessUtility_hook;
5367#if (PG_VERSION_NUM >= 150000)
5368 prev_shmem_request = shmem_request_hook;
5369 shmem_request_hook = provsql_shmem_request;
5370#else
5372#endif
5373
5374 planner_hook = provsql_planner;
5375 shmem_startup_hook = provsql_shmem_startup;
5376 ExecutorStart_hook = provsql_executor_start;
5377 ExecutorEnd_hook = provsql_executor_end;
5378 ProcessUtility_hook = provsql_ProcessUtility;
5379
5381}
5382
5383/**
5384 * @brief Extension teardown — restores the planner and shmem hooks.
5385 */
5386void _PG_fini(void) {
5387 planner_hook = prev_planner;
5388 shmem_startup_hook = prev_shmem_startup;
5389 ExecutorStart_hook = prev_ExecutorStart;
5390 ExecutorEnd_hook = prev_ExecutorEnd;
5391 ProcessUtility_hook = prev_ProcessUtility;
5392}
#define PROVSQL_TABLE_INFO_MAX_BLOCK_KEY
Cap on the number of block-key columns recorded per relation.
provsql_table_kind
How the provenance leaves of a tracked relation are correlated.
@ PROVSQL_TABLE_TID
@ PROVSQL_TABLE_BID
@ PROVSQL_TABLE_OPAQUE
#define PROVSQL_TABLE_INFO_MAX_ANCESTORS
Cap on the number of base ancestors recorded per relation.
bool provsql_classify_top_level
Backing storage for the provsql.classify_top_level GUC.
void provsql_classify_emit_notice(const ProvSQLClassification *c)
Render the result of provsql_classify_query as a NOTICE.
void provsql_classify_query(Query *q, ProvSQLClassification *out)
Classify the result relation of a parsed top-level Query.
Public surface of the query-time TID / BID / OPAQUE classifier.
List * list_insert_nth(List *list, int pos, void *datum)
Insert datum at position pos in list (PG < 13 backport).
PostgreSQL cross-version compatibility shims for ProvSQL.
#define F_SUM_INT4
OID of sum(int4) aggregate function (pre-PG 14).
static List * my_list_delete_cell(List *list, ListCell *cell, ListCell *prev)
Version-agnostic wrapper around list_delete_cell().
static ListCell * my_lnext(const List *l, const ListCell *c)
Version-agnostic wrapper around lnext().
#define F_COUNT_
OID of count() aggregate function (pre-PG 14).
#define F_COUNT_ANY
OID of count(*) / count(any) aggregate function (pre-PG 14).
Datum provenance(PG_FUNCTION_ARGS)
Error stub for provsql.provenance() on untracked tables.
Definition provenance.c:31
static void transform_distinct_into_group_by(Query *q)
Convert a SELECT DISTINCT into an equivalent GROUP BY.
Definition provsql.c:2672
static const char * provsql_ctas_kind_label(provsql_table_kind k)
Map provsql_table_kind to its textual label (set_table_info accepts text).
Definition provsql.c:4856
static void remove_provenance_attribute_groupref(Query *q, const Bitmapset *removed_sortgrouprefs)
Remove sort/group references that belonged to removed provenance columns.
Definition provsql.c:2705
static qual_class classify_qual(Expr *expr, const constants_t *constants)
Classify expr along the qual_class axis.
Definition provsql.c:3679
static FuncExpr * having_Expr_to_provenance_cmp(Expr *expr, const constants_t *constants, bool negated)
Dispatch a HAVING sub-expression to the appropriate converter.
Definition provsql.c:1204
static void maybe_cast_agg_token_args(List *args, Oid parent_funcid, const constants_t *constants)
Cast provenance_aggregate arguments of an operator or function when the formal parameter type require...
Definition provsql.c:2394
Datum set_ancestors(PG_FUNCTION_ARGS)
PostgreSQL-callable wrapper for setTableAncestry() over the IPC pipe.
static bool transform_except_into_join(const constants_t *constants, Query *q)
Rewrite an EXCEPT query into a LEFT JOIN with monus provenance.
Definition provsql.c:3179
bool provsql_where_provenance
Global variable that indicates if where-provenance support has been activated through the provsql....
Definition provsql.c:76
static bool has_provenance_walker(Node *node, void *data)
Definition provsql.c:2967
static bool has_rv_or_provenance_call(Node *node, void *data)
Tree walker that detects any provenance-bearing relation or provenance() call.
Definition provsql.c:2937
int provsql_verbose
Verbosity level; controlled by the provsql.verbose_level GUC.
Definition provsql.c:78
static bool aggtoken_walker(Node *node, void *data)
Tree walker that detects any Var of type agg_token.
Definition provsql.c:3078
static bool check_selection_on_aggregate(OpExpr *op, const constants_t *constants)
Check whether op is a supported comparison on an aggregate result.
Definition provsql.c:3485
void _PG_init(void)
Extension initialization – called once when the shared library is loaded.
Definition provsql.c:5115
bool provsql_simplify_on_load
Run universal cmp-resolution passes when getGenericCircuit returns; controlled by the provsql....
Definition provsql.c:83
static FuncExpr * having_BoolExpr_to_provenance(BoolExpr *be, const constants_t *constants, bool negated)
Convert a Boolean combination of HAVING comparisons into a provenance_times / provenance_plus gate ex...
Definition provsql.c:1154
static bool is_target_agg_var(Node *node, aggregation_type_mutator_context *context)
Check if a Var matches the target aggregate column.
Definition provsql.c:217
static bool has_aggtoken(Node *node, const constants_t *constants)
Return true if node contains a Var of type agg_token.
Definition provsql.c:3102
static int rv_cmp_index(const constants_t *constants, Oid funcoid)
Test whether funcoid is one of the random_variable_* comparison procedures, and if so return its Comp...
Definition provsql.c:1242
static void replace_provenance_function_by_expression(const constants_t *constants, Query *q, Expr *provsql)
Replace every explicit provenance() call in q with provsql.
Definition provsql.c:2648
static Node * reduce_varattno_mutator(Node *node, void *ctx)
Tree-mutator callback that adjusts Var attribute numbers.
Definition provsql.c:168
static void provsql_ProcessUtility_capture(Node *parsetree, ProvSQLCtasCapture *cap)
Decide whether parsetree is a CTAS that should trigger the ancestry hook, and if so populate cap with...
Definition provsql.c:4740
static List * get_provenance_attributes(const constants_t *constants, Query *q)
Collect all provenance Var nodes reachable from q's range table.
Definition provsql.c:420
static bool check_expr_on_rv(Expr *expr, const constants_t *constants)
Test whether expr is a Boolean combination of only random_variable comparisons (no other leaves allow...
Definition provsql.c:1446
PG_MODULE_MAGIC
Required PostgreSQL extension magic block.
Definition provsql.c:68
static void insert_agg_token_casts(const constants_t *constants, Query *q)
Walk query and insert agg_token casts where needed.
Definition provsql.c:3976
Datum set_table_info(PG_FUNCTION_ARGS)
Forward declaration of the C SQL entry points.
static Query * build_inner_for_distinct_key(Query *q, Expr *key_expr, List *groupby_tes)
Build the inner GROUP-BY subquery for one AGG(DISTINCT key).
Definition provsql.c:1935
static Expr * wrap_random_variable_uuid(Node *operand, const constants_t *constants)
Wrap an expression returning random_variable in a binary-coercible cast to uuid.
Definition provsql.c:1263
int provsql_rv_mc_samples
Default sample count for analytical-evaluator MC fallbacks; 0 disables fallback (callers raise instea...
Definition provsql.c:82
static Query * process_query(const constants_t *constants, Query *q, bool **removed, bool wrap_root)
Rewrite a single SELECT query to carry provenance.
Definition provsql.c:4043
static Node * add_to_havingQual(Node *havingQual, Expr *expr)
Append expr to havingQual with an AND, creating one if needed.
Definition provsql.c:3452
static void fix_type_of_aggregation_result(const constants_t *constants, Query *q, Index rteid, List *targetList)
Retypes aggregation-result Vars in q from UUID to agg_token.
Definition provsql.c:297
static Expr * wrap_in_assume_boolean(const constants_t *constants, Expr *expr)
Wrap expr in a provsql.assume_boolean FuncExpr.
Definition provsql.c:4000
static void provsql_executor_end(QueryDesc *queryDesc)
Definition provsql.c:4650
static void process_set_operation_union(const constants_t *constants, SetOperationStmt *stmt, Query *q)
Recursively annotate a UNION tree with the provenance UUID type.
Definition provsql.c:3365
static bool having_lift_walker(Node *node, void *data)
Walker for needs_having_lift: detect any operand shape that the HAVING-lift rewriter (having_OpExpr_t...
Definition provsql.c:3122
static Node * wrap_agg_token_with_cast(FuncExpr *prov_agg, const constants_t *constants)
Wrap a provenance_aggregate FuncExpr with a cast to the original aggregate return type.
Definition provsql.c:2353
static void replace_aggregations_by_provenance_aggregate(const constants_t *constants, Query *q, List *prov_atts, semiring_operation op)
Replace every Aggref in q with a provenance-aware aggregate.
Definition provsql.c:2477
static Var * make_provenance_attribute(const constants_t *constants, Query *q, RangeTblEntry *r, Index relid, AttrNumber attid)
Build a Var node that references the provenance column of a relation.
Definition provsql.c:116
static void provsql_executor_start(QueryDesc *queryDesc, int eflags)
Definition provsql.c:4633
static void remove_provenance_attribute_setoperations(Query *q, bool *removed)
Strip the provenance column's type info from a set-operation node.
Definition provsql.c:2742
static void add_to_select(Query *q, Expr *provenance)
Append the provenance expression to q's target list.
Definition provsql.c:2515
static void inline_ctes(Query *q)
Inline all CTE references in q as subqueries.
Definition provsql.c:392
void _PG_fini(void)
Extension teardown — restores the planner and shmem hooks.
Definition provsql.c:5386
static Node * provenance_mutator(Node *node, void *ctx)
Tree-mutator that replaces provenance() calls with the actual provenance expression.
Definition provsql.c:2593
static Node * aggregation_type_mutator(Node *node, void *ctx)
Tree-mutator that retypes a specific Var to agg_token.
Definition provsql.c:242
int provsql_monte_carlo_seed
Seed for the Monte Carlo sampler; -1 means non-deterministic (std::random_device); controlled by the ...
Definition provsql.c:81
static void provsql_ProcessUtility_apply(Node *parsetree, ProvSQLCtasCapture *cap)
Apply cap to the freshly-created relation stmt->into->rel.
Definition provsql.c:4883
static List * migrate_probabilistic_quals(const constants_t *constants, Query *q)
Unified WHERE classifier – routes each top-level conjunct to the right evaluation site in a single pa...
Definition provsql.c:3759
static void cast_agg_token_in_list(ListCell *lc, insert_agg_token_casts_context *ctx)
Wrap an agg_token Var in a cast to its original type, in place.
Definition provsql.c:3889
bool provsql_cmp_probability_evaluation
Run closed-form / analytic probability evaluators for gate_cmps inside probability_evaluate (currentl...
Definition provsql.c:85
static Node * cast_agg_token_mutator(Node *node, void *ctx)
Tree-mutator that casts provenance_aggregate results back to the original aggregate return type where...
Definition provsql.c:2440
char * provsql_tool_search_path
Colon-separated directory list prepended to PATH when invoking external tools (d4,...
Definition provsql.c:80
static Query * rewrite_agg_distinct(Query *q, const constants_t *constants)
Rewrite every AGG(DISTINCT key) in q using independent subqueries.
Definition provsql.c:2113
bool provsql_interrupted
Global variable that becomes true if this particular backend received an interrupt signal.
Definition provsql.c:74
static Expr * combine_prov_atts(const constants_t *constants, List *prov_atts, semiring_operation op)
Build the per-row provenance token for an aggregate rewrite.
Definition provsql.c:798
bool provsql_boolean_provenance
Opt-in safe-query optimisation: when true, rewrites hierarchical conjunctive queries to a read-once f...
Definition provsql.c:86
static FuncExpr * rv_BoolExpr_to_provenance(BoolExpr *be, const constants_t *constants, bool negated)
Convert a Boolean combination of RV comparisons into a provenance_times / provenance_plus expression.
Definition provsql.c:1336
static void inline_ctes_in_rtable(List *rtable, List *cteList)
Inline CTE references as subqueries within a query.
Definition provsql.c:358
static Expr * add_eq_from_Quals_to_Expr(const constants_t *constants, Node *quals, Expr *result, int **columns)
Walk a join-condition or WHERE quals node and add eq gates for every equality it contains.
Definition provsql.c:755
static Query * rewrite_non_all_into_external_group_by(Query *q)
Wrap a non-ALL set operation in an outer GROUP BY query.
Definition provsql.c:2783
static PlannedStmt * provsql_planner(Query *q, int cursorOptions, ParamListInfo boundParams)
Definition provsql.c:4540
static ExecutorStart_hook_type prev_ExecutorStart
Definition provsql.c:4630
static bool provsql_active
true while ProvSQL query rewriting is enabled
Definition provsql.c:75
static bool provenance_function_in_group_by(const constants_t *constants, Query *q)
Check whether a provenance() call appears in the GROUP BY list.
Definition provsql.c:2880
static void provsql_ProcessUtility(PlannedStmt *pstmt, const char *queryString, ProcessUtilityContext context, ParamListInfo params, QueryEnvironment *queryEnv, DestReceiver *dest, char *completionTag)
Definition provsql.c:5056
static Bitmapset * remove_provenance_attributes_select(const constants_t *constants, Query *q, bool **removed)
Strip provenance UUID columns from q's SELECT list.
Definition provsql.c:577
static Expr * make_aggregation_expression(const constants_t *constants, Aggref *agg_ref, List *prov_atts, semiring_operation op)
Build the provenance expression for a single aggregate function.
Definition provsql.c:916
static Expr * add_eq_from_OpExpr_to_Expr(const constants_t *constants, OpExpr *fromOpExpr, Expr *toExpr, int **columns)
Wrap toExpr in a provenance_eq gate if fromOpExpr is an equality between two tracked columns.
Definition provsql.c:679
static FuncExpr * rv_Expr_to_provenance(Expr *expr, const constants_t *constants, bool negated)
Dispatch a WHERE sub-expression to the appropriate RV converter.
Definition provsql.c:1386
static planner_hook_type prev_planner
Previous planner hook (chained).
Definition provsql.c:92
static bool check_boolexpr_on_aggregate(BoolExpr *be, const constants_t *constants)
Check whether every leaf of a Boolean expression is a supported comparison on an aggregate result.
Definition provsql.c:3535
static void process_insert_select(const constants_t *constants, Query *q)
Propagate provenance through INSERT ... SELECT.
Definition provsql.c:4406
static bool expr_contains_rv_cmp(Node *node, const constants_t *constants)
Test whether an Expr (sub-)tree contains any RV comparison.
Definition provsql.c:1409
static Oid get_agg_token_orig_type(Var *v, insert_agg_token_casts_context *ctx)
Look up the original aggregate return type for an agg_token Var.
Definition provsql.c:3861
static void cast_agg_token_args(List *args, insert_agg_token_casts_context *ctx)
Wrap any agg_token Vars in an argument list.
Definition provsql.c:3923
static Query * build_outer_for_distinct_key(TargetEntry *orig_agg_te, Query *inner, int n_gb, const constants_t *constants)
Wrap inner in an outer query that applies the original aggregate.
Definition provsql.c:2005
static int provsql_executor_depth
PostgreSQL planner hook — entry point for provenance rewriting.
Definition provsql.c:4538
static ProcessUtility_hook_type prev_ProcessUtility
Definition provsql.c:4712
static void build_column_map(Query *q, int **columns, int *nbcols)
Build the per-RTE column-numbering map used by where-provenance.
Definition provsql.c:3600
static Node * insert_agg_token_casts_mutator(Node *node, void *data)
Insert agg_token casts for Vars used in expressions.
Definition provsql.c:3943
bool provsql_hybrid_evaluation
Run the hybrid-evaluator simplifier inside probability_evaluate; controlled by the provsql....
Definition provsql.c:84
static bool provsql_update_provenance
true when provenance tracking for DML is enabled
Definition provsql.c:77
static bool check_expr_on_aggregate(Expr *expr, const constants_t *constants)
Top-level dispatcher for supported WHERE-on-aggregate patterns.
Definition provsql.c:3561
static bool provenance_function_walker(Node *node, void *data)
Tree walker that returns true if any provenance() call is found.
Definition provsql.c:2855
semiring_operation
Semiring operation used to combine provenance tokens.
Definition provsql.c:646
@ SR_PLUS
Semiring addition (UNION, SELECT DISTINCT).
Definition provsql.c:647
@ SR_TIMES
Semiring multiplication (JOIN, Cartesian product).
Definition provsql.c:649
@ SR_MONUS
Semiring monus / set difference (EXCEPT).
Definition provsql.c:648
static bool needs_having_lift(Node *havingQual, const constants_t *constants)
Return true if havingQual contains anything the HAVING-lift path needs to handle (an agg_token Var or...
Definition provsql.c:3155
static Node * aggregation_mutator(Node *node, void *ctx)
Tree-mutator that replaces Aggrefs with provenance-aware aggregates.
Definition provsql.c:2331
static bool expr_contains_aggref_walker(Node *node, void *context)
expression_tree_walker predicate: returns true on the first Aggref it encounters.
Definition provsql.c:2578
static ExecutorEnd_hook_type prev_ExecutorEnd
Definition provsql.c:4631
bool provsql_aggtoken_text_as_uuid
When true, agg_token::text emits the underlying provenance UUID instead of "value (*)".
Definition provsql.c:79
static Expr * make_provenance_expression(const constants_t *constants, Query *q, List *prov_atts, bool aggregation, bool group_by_rewrite, semiring_operation op, int **columns, int nbcols, bool wrap_assumed_boolean)
Build the combined provenance expression to be added to the SELECT list.
Definition provsql.c:1510
static FuncExpr * having_OpExpr_to_provenance_cmp(OpExpr *opExpr, const constants_t *constants, bool negated)
Convert a comparison OpExpr on aggregate results into a provenance_cmp gate expression.
Definition provsql.c:1050
static void add_select_non_zero(const constants_t *constants, Query *q, Expr *provsql)
Add a WHERE condition filtering out zero-provenance tuples.
Definition provsql.c:3415
static void reduce_varattno_by_offset(List *targetList, Index varno, int *offset)
Adjust Var attribute numbers in targetList after columns are removed.
Definition provsql.c:196
qual_class
Categorisation of a top-level WHERE conjunct.
Definition provsql.c:3653
@ QUAL_MIXED_RV_DET
random_variable mixed with non-RV leaves; error
Definition provsql.c:3658
@ QUAL_PURE_AGG
pure agg_token expression; route to HAVING
Definition provsql.c:3655
@ QUAL_DETERMINISTIC
no probabilistic value; stays in WHERE
Definition provsql.c:3654
@ QUAL_MIXED_AGG_DET
agg_token mixed with non-agg leaves; error
Definition provsql.c:3657
@ QUAL_MIXED_AGG_RV
agg_token and random_variable in the same expr; error
Definition provsql.c:3659
@ QUAL_PURE_RV
pure random_variable expression; lift to provenance
Definition provsql.c:3656
static FuncExpr * rv_OpExpr_to_provenance_cmp(OpExpr *opExpr, const constants_t *constants, bool negated)
Convert a single RV-comparison OpExpr into a provenance_cmp() FuncExpr returning UUID.
Definition provsql.c:1297
static Expr * make_rv_aggregate_expression(const constants_t *constants, Aggref *agg_ref, List *prov_atts, semiring_operation op)
Inline rewrite of an RV-returning aggregate into the same aggregate over provenance-wrapped per-row a...
Definition provsql.c:855
static void error_for_mixed_qual(qual_class c)
Raise the user-facing error appropriate to a mixed c.
Definition provsql.c:3705
static bool has_provenance(const constants_t *constants, Query *q)
Return true if q involves any provenance-bearing relation or contains an explicit provenance() call.
Definition provsql.c:3068
#define provsql_error(fmt,...)
Report a fatal ProvSQL error and abort the current transaction.
#define provsql_warning(fmt,...)
Emit a ProvSQL warning message (execution continues).
#define provsql_notice(fmt,...)
Emit a ProvSQL informational notice (execution continues).
void RegisterProvSQLMMapWorker(void)
Register the ProvSQL mmap background worker with PostgreSQL.
Background worker and IPC primitives for mmap-backed circuit storage.
void provsql_shmem_request(void)
Request shared memory from PostgreSQL (PG ≥ 15).
shmem_startup_hook_type prev_shmem_startup
Saved pointer to the previous shmem_startup_hook, for chaining.
void provsql_shmem_startup(void)
Initialise the ProvSQL shared-memory segment.
Shared-memory segment and inter-process pipe management.
shmem_request_hook_type prev_shmem_request
Saved pointer to the previous shmem_request_hook (PG ≥ 15), for chaining.
Oid find_equality_operator(Oid ltypeId, Oid rtypeId)
Find the equality operator OID for two given types.
bool provsql_lookup_ancestry(Oid relid, uint16 *ancestor_n_out, Oid *ancestors_out)
Look up the base-ancestor set of a tracked relation.
bool provsql_lookup_table_info(Oid relid, ProvenanceTableInfo *out)
Look up per-table provenance metadata with a backend-local cache.
constants_t get_constants(bool failure_if_not_possible)
Retrieve the cached OID constants for the current database.
Core types, constants, and utilities shared across ProvSQL.
#define PROVSQL_COLUMN_NAME
Canonical name of the per-row provenance column installed by add_provenance / repair_key.
Query * try_safe_query_rewrite(const constants_t *constants, Query *q)
Top-level entry point for the safe-query rewrite.
Public surface of the safe-query (hierarchical-CQ) rewriter.
void strip_group_rte_pg18(Query *q)
PG 18 helper: strip the synthetic RTE_GROUP entry from q in place, resolving every grouped Var back t...
Result of provsql_classify_query.
provsql_table_kind kind
State captured by the pre-execution pass for the post-execution one.
Definition provsql.c:4715
provsql_table_kind inherited_kind
Definition provsql.c:4718
bool fire
true when the post-pass should run
Definition provsql.c:4716
uint16 source_block_key_n
Definition provsql.c:4722
Oid ancestors[PROVSQL_TABLE_INFO_MAX_ANCESTORS]
Definition provsql.c:4720
AttrNumber source_block_key[PROVSQL_TABLE_INFO_MAX_BLOCK_KEY]
Definition provsql.c:4723
Query * inner_query
cloned for safety; freed by pfree on completion
Definition provsql.c:4717
Oid source_relid
Single source whose block_key we want to align (BID only).
Definition provsql.c:4721
Per-relation metadata for the safe-query optimisation.
AttrNumber block_key[PROVSQL_TABLE_INFO_MAX_BLOCK_KEY]
Block-key column numbers.
uint16_t block_key_n
Number of valid entries in block_key.
uint8_t kind
One of provsql_table_kind.
Context for the aggregation_mutator tree walker.
Definition provsql.c:2318
semiring_operation op
Semiring operation for combining tokens.
Definition provsql.c:2320
const constants_t * constants
Extension OID cache.
Definition provsql.c:2321
List * prov_atts
List of provenance Var nodes.
Definition provsql.c:2319
Context for the aggregation_type_mutator tree walker.
Definition provsql.c:208
const constants_t * constants
Extension OID cache.
Definition provsql.c:211
Index varattno
Attribute number of the aggregate column.
Definition provsql.c:210
Index varno
Range-table entry index of the aggregate var.
Definition provsql.c:209
Structure to store the value of various constants.
Oid OID_FUNCTION_PROVENANCE_EQ
OID of the provenance_eq FUNCTION.
Oid OID_FUNCTION_PROVENANCE_AGGREGATE
OID of the provenance_aggregate FUNCTION.
Oid OID_FUNCTION_PROVENANCE_SEMIMOD
OID of the provenance_semimod FUNCTION.
Oid OID_FUNCTION_PROVENANCE
OID of the provenance FUNCTION.
Oid OID_FUNCTION_AGG_TOKEN_UUID
OID of the agg_token_uuid FUNCTION.
Oid OID_FUNCTION_RV_AGGREGATE_SEMIMOD
OID of rv_aggregate_semimod helper (uuid, rv -> rv) used to wrap each per-row argument of an RV-retur...
Oid OID_FUNCTION_GATE_ZERO
OID of the provenance_zero FUNCTION.
Oid OID_FUNCTION_PROVENANCE_PROJECT
OID of the provenance_project FUNCTION.
Oid OID_TYPE_AGG_TOKEN
OID of the agg_token TYPE.
Oid OID_FUNCTION_ARRAY_AGG
OID of the array_agg FUNCTION.
Oid OID_TYPE_INT
OID of the INT TYPE.
Oid OID_FUNCTION_PROVENANCE_PLUS
OID of the provenance_plus FUNCTION.
Oid OID_OPERATOR_NOT_EQUAL_UUID
OID of the <> operator on UUIDs FUNCTION.
Oid OID_TYPE_UUID
OID of the uuid TYPE.
bool ok
true if constants were loaded
Oid OID_TYPE_INT_ARRAY
OID of the INT[] TYPE.
Oid OID_FUNCTION_PROVENANCE_DELTA
OID of the provenance_delta FUNCTION.
Oid OID_FUNCTION_ASSUME_BOOLEAN
OID of provsql.assume_boolean(uuid)->uuid.
Oid OID_FUNCTION_PROVENANCE_TIMES
OID of the provenance_times FUNCTION.
Oid OID_FUNCTION_PROVENANCE_MONUS
OID of the provenance_monus FUNCTION.
Oid OID_FUNCTION_NOT_EQUAL_UUID
OID of the = operator on UUIDs FUNCTION.
Oid OID_FUNCTION_GATE_ONE
OID of the provenance_one FUNCTION.
Oid OID_TYPE_UUID_ARRAY
OID of the uuid[] TYPE.
Oid OID_TYPE_RANDOM_VARIABLE
OID of the random_variable TYPE.
Oid OID_FUNCTION_PROVENANCE_CMP
OID of the provenance_cmp FUNCTION.
Oid OID_FUNCTION_RV_CMP[6]
OIDs of the random_variable_{eq,ne,le,lt,ge,gt} comparison procedure functions, indexed by the Compar...
Context for the insert_agg_token_casts_mutator.
Definition provsql.c:3849
const constants_t * constants
Extension OID cache.
Definition provsql.c:3851
Query * query
Outer query (to look up subquery RTEs).
Definition provsql.c:3850
Context for the provenance_mutator tree walker.
Definition provsql.c:2562
bool provsql_has_aggref
true when provsql contains an Aggref (set once by replace_provenance_function_by_expression)....
Definition provsql.c:2565
bool inside_aggref
true while descending the argument tree of an Aggref node.
Definition provsql.c:2566
const constants_t * constants
Extension OID cache.
Definition provsql.c:2564
Expr * provsql
Provenance expression to substitute for provenance() calls.
Definition provsql.c:2563
Context for the reduce_varattno_mutator tree walker.
Definition provsql.c:157
Index varno
Range-table entry whose attribute numbers are being adjusted.
Definition provsql.c:158
int * offset
Per-attribute cumulative shift to apply.
Definition provsql.c:159