ProvSQL C/C++ API
Adding support for provenance and uncertainty management to PostgreSQL databases
Loading...
Searching...
No Matches
classify_query.c
Go to the documentation of this file.
1/**
2 * @file classify_query.c
3 * @brief Query-time TID / BID / OPAQUE classifier.
4 *
5 * Invoked by @c provsql_planner on the top-level @c Query when the
6 * @c provsql.classify_top_level GUC is on. Emits a @c NOTICE carrying
7 * the certified kind and the set of provenance-tracked base relations
8 * the query touches.
9 *
10 * Scope :
11 *
12 * - Single-source classification : a flat @c fromlist of
13 * @c RangeTblRefs, with no kind-altering features (@c SubLinks,
14 * modifying @c CTEs, @c cteList, @c DISTINCT, @c GROUP BY,
15 * @c HAVING, aggregates, window functions, set-returning
16 * functions in the target list). Either zero or one
17 * provenance-tracked base relations are reached either directly
18 * (@c RTE_RELATION) or through any depth of subqueries
19 * (@c RTE_SUBQUERY -- view bodies after PG rewriting and inline
20 * @c FROM subqueries). The recorded kind of the sole tracked
21 * source (TID / BID / OPAQUE) is preserved verbatim ; zero
22 * tracked sources is trivially TID.
23 * - @c UNION @c ALL specialisation : a fully-@c UNION-ALL tree of
24 * leg subqueries each of which classifies independently as TID
25 * over a base-relid set that is disjoint from every other leg's.
26 * The union is then TID with the cumulative source list.
27 * Anything that doesn't fit (@c INTERSECT, @c EXCEPT,
28 * @c UNION @c DISTINCT, mixed kinds, overlapping leg sources)
29 * falls back to OPAQUE.
30 *
31 * Everything else is reported as OPAQUE. Independent-TID join
32 * inference, BID block-key preservation under projection and
33 * @c GROUP @c BY, and the per-relation base-ancestor registry the
34 * disjointness check consults all live in this same file ; see
35 * the helpers below.
36 */
37#include "postgres.h"
38
39#include "lib/stringinfo.h"
40#include "nodes/bitmapset.h"
41#include "nodes/nodes.h"
42#include "nodes/parsenodes.h"
43#include "nodes/pg_list.h"
44#if PG_VERSION_NUM >= 120000
45#include "optimizer/optimizer.h" /* get_sortgroupclause_tle */
46#else
47#include "optimizer/tlist.h" /* get_sortgroupclause_tle (PG <12) */
48#endif
49#include "utils/builtins.h"
50#include "utils/lsyscache.h"
51
52#include "classify_query.h"
53#include "provsql_utils.h"
54
55/** @brief Backing storage for the @c provsql.classify_top_level GUC. */
57
58/** @brief Map a @c provsql_table_kind to its uppercase user-facing label. */
59static const char *kind_label(provsql_table_kind k) {
60 switch (k) {
61 case PROVSQL_TABLE_TID: return "TID";
62 case PROVSQL_TABLE_BID: return "BID";
63 case PROVSQL_TABLE_OPAQUE: return "OPAQUE";
64 }
65 return "?";
66}
67
68/**
69 * @brief Decide whether a @c jointree fromlist entry has a shape the
70 * classifier can certify : a plain @c RangeTblRef, or a
71 * @c JoinExpr with @c jointype @c == @c JOIN_INNER whose
72 * @c larg and @c rarg recursively satisfy the same predicate.
73 *
74 * ANSI-syntax inner joins (@c INNER @c JOIN, @c CROSS @c JOIN; PG
75 * normalises both to @c JOIN_INNER) preserve TID per-row independence
76 * because every output row corresponds to exactly one pair of source
77 * rows -- the row's provenance is the AND of the two tokens. Outer
78 * joins (@c LEFT / @c RIGHT / @c FULL) introduce NULL-padding rows
79 * whose provenance is the negation of the inner-match disjunction, so
80 * they break the per-row TID property and stay OPAQUE. Semi / anti
81 * joins are also rejected (the planner uses @c JOIN_SEMI / @c JOIN_ANTI
82 * for sublink-driven joins, which our @c hasSubLinks shape gate has
83 * already filtered out upstream, but the explicit check here keeps
84 * the predicate self-contained).
85 */
86static bool classify_fromlist_shape_ok(Node *n) {
87 if (n == NULL)
88 return false;
89 if (IsA(n, RangeTblRef))
90 return true;
91 if (IsA(n, JoinExpr)) {
92 JoinExpr *je = (JoinExpr *) n;
93 if (je->jointype != JOIN_INNER)
94 return false;
95 return classify_fromlist_shape_ok(je->larg)
96 && classify_fromlist_shape_ok(je->rarg);
97 }
98 return false;
99}
100
101/**
102 * @brief Recursive walker shared by the top-level entry point and the
103 * @c RTE_SUBQUERY descent.
104 *
105 * The shape gate, source enumeration, and recursion live here so the
106 * outer entry point can decide TID / BID / OPAQUE from the cumulative
107 * @c n_meta / @c sole_relid pair after the whole tree has been
108 * walked. Tracked @c RTE_RELATION entries reachable through any
109 * depth of @c RTE_SUBQUERY (view bodies after PG rewriting, inline
110 * @c FROM-clause subqueries) contribute to the accumulator.
111 *
112 * The recursion is stack-bounded by the SQL parser's own nesting
113 * limit ; no explicit depth cap is needed at this layer.
114 */
115static void classify_walk(Query *q,
117 bool *shape_ok,
118 int *n_meta,
119 Oid *sole_relid) {
120 ListCell *lc;
121
122 if (q == NULL || q->commandType != CMD_SELECT) {
123 *shape_ok = false;
124 return;
125 }
126
127 /* Shape gate at this level. Anything that turns row lineage
128 * into a composite (aggregates, GROUP BY, HAVING, DISTINCT,
129 * window functions, SRFs in the target list) breaks the
130 * per-row independent-atom property TID demands, so we refuse
131 * to certify. Hidden subqueries, modifying CTEs, named CTEs,
132 * and set operations are conservative rejects from the original
133 * scope (set operations have a dedicated UNION ALL path higher
134 * up the dispatcher). A FROM-less @c SELECT (e.g.
135 * @c SELECT @c 1) keeps the gate open because there are no
136 * sub-structures to inspect; it ends up trivially TID at the
137 * outer level. */
138 if (q->hasSubLinks
139 || q->hasModifyingCTE
140 || q->cteList != NIL
141 || q->setOperations != NULL
142 || q->distinctClause != NIL
143 || q->groupClause != NIL
144 || q->groupingSets != NIL
145 || q->havingQual != NULL
146 || q->hasAggs
147 || q->hasWindowFuncs
148 || q->hasTargetSRFs)
149 *shape_ok = false;
150
151 if (*shape_ok && q->jointree != NULL && q->jointree->fromlist != NIL) {
152 foreach (lc, q->jointree->fromlist) {
153 if (!classify_fromlist_shape_ok((Node *) lfirst(lc))) {
154 *shape_ok = false;
155 break;
156 }
157 }
158 }
159
160 /* Walk the range table. RTE_RELATION entries with metadata are
161 * collected as sources; @c RTE_SUBQUERY recurses (view bodies and
162 * inline subqueries) so the underlying base relations join the
163 * accumulator. @c RTE_JOIN entries (one per @c JoinExpr in the
164 * fromlist) are synthetic union-aliases over the join's combined
165 * column list ; they carry no source on their own and pass through.
166 * The fromlist shape gate above already constrains them to
167 * @c JOIN_INNER, so seeing one here does not change the
168 * per-row independence story. Any other @c rtekind (@c RTE_VALUES,
169 * @c RTE_CTE, @c RTE_FUNCTION, the PG 18 synthetic @c RTE_GROUP,
170 * ...) trips the shape gate so we conservatively report OPAQUE.
171 * @c RTE_GROUP appears alongside the user RTEs when @c q->groupClause
172 * is non-empty -- our shape gate already rejects @c groupClause
173 * upstream, so by the time control reaches this loop on a GROUP BY
174 * query @c *shape_ok is already false and the catch-all here is
175 * just re-asserting it without enumerating @c RTE_GROUP as a
176 * source. Treating it as a generic non-source rtekind keeps the
177 * file source-compatible across PG 12-18+ without a version
178 * guard. */
179 foreach (lc, q->rtable) {
180 RangeTblEntry *rte = (RangeTblEntry *) lfirst(lc);
182
183 if (rte->rtekind == RTE_RELATION) {
184 if (provsql_lookup_table_info(rte->relid, &info)) {
185 out->source_relids = lappend_oid(out->source_relids, rte->relid);
186 *sole_relid = rte->relid;
187 (*n_meta)++;
188 }
189 } else if (rte->rtekind == RTE_SUBQUERY) {
190 /* Descend. The same shape gate is applied to the inner
191 * @c Query : a @c SubLink or set operation inside the
192 * subquery propagates opacity to the outer level, while the
193 * visible inner @c RTE_RELATION sources are still added to
194 * the accumulator for diagnostic purposes. */
195 classify_walk(rte->subquery, out, shape_ok, n_meta, sole_relid);
196 } else if (rte->rtekind == RTE_JOIN) {
197 /* Synthetic RTE for a JoinExpr; the underlying source RTEs
198 * appear separately in the same rtable. No-op here. */
199 } else {
200 *shape_ok = false;
201 }
202 }
203}
204
205/**
206 * @brief Walk a @c SetOperationStmt tree, collecting each leaf
207 * leg's @c Query body into @c legs.
208 *
209 * The tree is a binary structure : interior @c SetOperationStmt
210 * nodes carry the operator and an @c all flag, leaf @c RangeTblRefs
211 * point at @c RTE_SUBQUERY entries in @c parent->rtable whose
212 * @c subquery field is the leg's parsed @c Query. Returns false
213 * if any interior node is not a @c UNION @c ALL or any leaf is not
214 * a subquery RTE, so that the dispatcher can fall back to OPAQUE
215 * (only fully-UNION-ALL trees qualify for the TID promotion).
216 */
217static bool collect_union_all_legs(Node *node, Query *parent,
218 List **legs) {
219 if (node == NULL)
220 return false;
221 if (IsA(node, RangeTblRef)) {
222 int rtindex = ((RangeTblRef *) node)->rtindex;
223 RangeTblEntry *rte;
224 if (rtindex < 1 || rtindex > list_length(parent->rtable))
225 return false;
226 rte = (RangeTblEntry *) list_nth(parent->rtable, rtindex - 1);
227 if (rte->rtekind != RTE_SUBQUERY || rte->subquery == NULL)
228 return false;
229 *legs = lappend(*legs, rte->subquery);
230 return true;
231 }
232 if (IsA(node, SetOperationStmt)) {
233 SetOperationStmt *s = (SetOperationStmt *) node;
234 if (s->op != SETOP_UNION || !s->all)
235 return false;
236 return collect_union_all_legs(s->larg, parent, legs)
237 && collect_union_all_legs(s->rarg, parent, legs);
238 }
239 return false;
240}
241
242/**
243 * @brief Promote a fully-UNION-ALL @c Query to TID when each leg
244 * classifies as TID and the leg source-relid sets are
245 * pairwise disjoint.
246 *
247 * Returns true and populates @c out with TID + the cumulative
248 * source list on success ; returns false to let the dispatcher
249 * fall through to the conservative OPAQUE path on any failure
250 * (non-UNION-ALL operator, a leg that classifies as BID/OPAQUE,
251 * or overlapping leg sources -- the gate-level atoms of a relid
252 * that appears in two legs are not disjoint, so the multiset
253 * union no longer satisfies the TID property).
254 *
255 * Pairwise disjointness is checked at the relid level only. A
256 * future correlation registry will refine this by also rejecting
257 * legs whose base ancestors overlap (e.g. two views sharing the
258 * same underlying TID table), which the syntactic check cannot
259 * detect.
260 *
261 * BID legs are deliberately not promoted, even when every leg is
262 * BID with the same block-key column projected at the same target-
263 * list position. The UNION ALL output's "rows with the same
264 * block-key value" set spans both legs, but a row from leg A and a
265 * row from leg B sharing a block-key value are independent (the
266 * legs are different relations), not mutually exclusive. Calling
267 * the result BID under that column would falsely advertise an
268 * invariant the rows don't satisfy. Recovering BID-ness would
269 * require either certifying disjoint block-key values between
270 * legs (not knowable from the query text) or emitting a synthetic
271 * composite block key @c (leg_id, k) in the output and recording
272 * it in @c provsql_table_info ; both paths are documented as
273 * conservative-by-design in the safe-query follow-ups.
274 */
275static bool try_classify_union_all(Query *q,
277 List *legs = NIL;
278 List *seen = NIL;
279 ListCell *lc;
280
281 if (q->setOperations == NULL || !IsA(q->setOperations, SetOperationStmt))
282 return false;
283
284 if (!collect_union_all_legs(q->setOperations, q, &legs)) {
285 list_free(legs);
286 return false;
287 }
288
289 foreach (lc, legs) {
290 Query *leg_query = (Query *) lfirst(lc);
292 ListCell *src_lc;
293
294 provsql_classify_query(leg_query, &leg);
295 if (leg.kind != PROVSQL_TABLE_TID) {
296 list_free(leg.source_relids);
297 list_free(seen);
298 list_free(legs);
299 return false;
300 }
301 foreach (src_lc, leg.source_relids) {
302 Oid relid = lfirst_oid(src_lc);
303 if (list_member_oid(seen, relid)) {
304 list_free(leg.source_relids);
305 list_free(seen);
306 list_free(legs);
307 return false;
308 }
309 seen = lappend_oid(seen, relid);
310 }
311 list_free(leg.source_relids);
312 }
313 list_free(legs);
314
315 out->kind = PROVSQL_TABLE_TID;
316 out->source_relids = seen;
317 return true;
318}
319
320/**
321 * @brief Conservative multi-source promotion: when every tracked
322 * source in @p out->source_relids is TID and the registered
323 * ancestor sets are pairwise disjoint, promote the
324 * classification to TID.
325 *
326 * Mirrors the disjointness check the safe-query rewriter runs in
327 * @c is_safe_query_candidate, just at the classifier layer so a
328 * multi-source query no longer collapses to OPAQUE before the
329 * rewriter even sees it. The hierarchical-CQ structure is NOT
330 * inspected here -- the rewriter runs the full check downstream,
331 * and the classifier's job is only to certify the per-row
332 * independence the user-visible NOTICE pill advertises.
333 *
334 * Returns @c true and sets @c out->kind on success. On failure
335 * (any source non-TID, no registry entry, or any pair of ancestor
336 * sets overlapping), returns @c false and leaves @p out unchanged
337 * so the caller falls through to OPAQUE.
338 */
340 ListCell *lc;
341 int i, j, n;
342 uint16 *anc_n;
344
345 n = list_length(out->source_relids);
346 if (n < 2)
347 return false;
348
349 anc_n = palloc0(n * sizeof(uint16));
350 anc = palloc(n * sizeof(*anc));
351
352 i = 0;
353 foreach (lc, out->source_relids) {
354 Oid relid = lfirst_oid(lc);
356 if (!provsql_lookup_table_info(relid, &info)
357 || info.kind != PROVSQL_TABLE_TID) {
358 pfree(anc);
359 pfree(anc_n);
360 return false;
361 }
362 if (!provsql_lookup_ancestry(relid, &anc_n[i], anc[i])) {
363 /* Defensive : add_provenance / repair_key seed {self} eagerly,
364 * so a tracked relation should always have a non-empty
365 * registry entry. If somehow missing, fall back to {self}
366 * (matches the safe-query rewriter's same fallback). */
367 anc[i][0] = relid;
368 anc_n[i] = 1;
369 }
370 i++;
371 }
372
373 for (i = 0; i < n; i++)
374 for (j = i + 1; j < n; j++) {
375 uint16 a, b;
376 for (a = 0; a < anc_n[i]; a++)
377 for (b = 0; b < anc_n[j]; b++)
378 if (anc[i][a] == anc[j][b]) {
379 pfree(anc);
380 pfree(anc_n);
381 return false;
382 }
383 }
384
385 pfree(anc);
386 pfree(anc_n);
387 out->kind = PROVSQL_TABLE_TID;
388 return true;
389}
390
391/**
392 * @brief Resolve a base-level (@p varno, @p attno) pair in @p q
393 * transitively through @c RTE_SUBQUERY layers until reaching
394 * the underlying @c RTE_RELATION column. Returns @c true on
395 * success and writes the base relid / base attno to
396 * @p out_relid / @p out_attno. Returns @c false when any
397 * intermediate TLE is not a plain @c Var (possibly through
398 * @c RelabelType wrappers), when an outer-scope reference is
399 * hit, or when the chain ends on a non-relation rtekind.
400 */
401static bool resolve_var_to_base(Query *q, Index varno, AttrNumber attno,
402 Oid *out_relid, AttrNumber *out_attno) {
403 RangeTblEntry *rte;
404 if (q == NULL || varno < 1
405 || (int) varno > list_length(q->rtable))
406 return false;
407 rte = (RangeTblEntry *) list_nth(q->rtable, varno - 1);
408 if (rte->rtekind == RTE_RELATION) {
409 *out_relid = rte->relid;
410 *out_attno = attno;
411 return true;
412 }
413 if (rte->rtekind == RTE_SUBQUERY && rte->subquery != NULL) {
414 Query *sub = rte->subquery;
415 TargetEntry *te;
416 Node *e;
417 Var *v;
418 ListCell *lc;
419 /* Match by resno : a TLE may carry a Var resjunk-tagged or
420 * reordered, so scan rather than @c list_nth blindly. */
421 te = NULL;
422 foreach (lc, sub->targetList) {
423 TargetEntry *t = (TargetEntry *) lfirst(lc);
424 if (t->resno == attno && !t->resjunk) {
425 te = t;
426 break;
427 }
428 }
429 if (te == NULL)
430 return false;
431 e = (Node *) te->expr;
432 while (e != NULL && IsA(e, RelabelType))
433 e = (Node *) ((RelabelType *) e)->arg;
434 if (e == NULL || !IsA(e, Var))
435 return false;
436 v = (Var *) e;
437 if (v->varlevelsup != 0)
438 return false;
439 return resolve_var_to_base(sub, v->varno, v->varattno,
440 out_relid, out_attno);
441 }
442 return false;
443}
444
445/**
446 * @brief Decide whether every block-key column of @p info survives
447 * in @p q's target list -- resolved transitively through
448 * @c RTE_SUBQUERY descents so the same check works on
449 * @c SELECT @c k @c FROM @c bid_t and on
450 * @c SELECT @c k @c FROM @c (SELECT @c k @c FROM @c bid_t).
451 * Renamed projections (@c SELECT @c k @c AS @c b ...) still
452 * count as preserving -- the match is on the underlying
453 * @c Var, not the output column's name. @c resjunk entries
454 * in the outer @c targetList are ignored.
455 */
456static bool bid_block_key_preserved(Query *q, Oid source_relid,
457 const ProvenanceTableInfo *info) {
458 for (uint16 i = 0; i < info->block_key_n; ++i) {
459 AttrNumber bk = info->block_key[i];
460 bool found = false;
461 ListCell *lc;
462 foreach (lc, q->targetList) {
463 TargetEntry *te = (TargetEntry *) lfirst(lc);
464 Node *e = (Node *) te->expr;
465 Var *v;
466 Oid base_relid;
467 AttrNumber base_attno;
468 if (te->resjunk)
469 continue;
470 while (e != NULL && IsA(e, RelabelType))
471 e = (Node *) ((RelabelType *) e)->arg;
472 if (e == NULL || !IsA(e, Var))
473 continue;
474 v = (Var *) e;
475 if (v->varlevelsup != 0)
476 continue;
477 if (!resolve_var_to_base(q, v->varno, v->varattno,
478 &base_relid, &base_attno))
479 continue;
480 if (base_relid == source_relid && base_attno == bk) {
481 found = true;
482 break;
483 }
484 }
485 if (!found)
486 return false;
487 }
488 return true;
489}
490
491/**
492 * @brief PG 18+ helper: when @p q has a synthetic @c RTE_GROUP
493 * entry (set @c parseCheckAggregates() appends it for every
494 * @c GROUP @c BY query), Vars in @c groupClause's TLE
495 * expressions point at the @c RTE_GROUP rather than the
496 * underlying source. Resolve them through the @c RTE_GROUP's
497 * @c groupexprs list so the BID-block-key check below sees
498 * the source Var. No-op on PG &lt; 18 and on queries without
499 * @c hasGroupRTE.
500 */
501static Node *resolve_through_group_rte(Query *q, Node *e) {
502#if PG_VERSION_NUM >= 180000
503 ListCell *lc;
504 Index group_rtindex = 0;
505 Index idx = 1;
506 List *groupexprs = NIL;
507 Var *v;
508
509 if (e == NULL || !q->hasGroupRTE)
510 return e;
511 foreach (lc, q->rtable) {
512 RangeTblEntry *rte = (RangeTblEntry *) lfirst(lc);
513 if (rte->rtekind == RTE_GROUP) {
514 group_rtindex = idx;
515 groupexprs = rte->groupexprs;
516 break;
517 }
518 idx++;
519 }
520 if (group_rtindex == 0)
521 return e;
522
523 while (e != NULL && IsA(e, RelabelType))
524 e = (Node *) ((RelabelType *) e)->arg;
525 if (e == NULL || !IsA(e, Var))
526 return e;
527 v = (Var *) e;
528 if (v->varlevelsup != 0 || v->varno != group_rtindex)
529 return e;
530 if (v->varattno < 1 || v->varattno > list_length(groupexprs))
531 return e;
532 return (Node *) list_nth(groupexprs, v->varattno - 1);
533#else
534 (void) q;
535 return e;
536#endif
537}
538
539/**
540 * @brief Pre-dispatch special case for @c GROUP @c BY on a single
541 * BID source's block-key columns.
542 *
543 * The generic shape gate rejects @c groupClause @c != @c NIL up
544 * front, but @c SELECT @c k @c FROM @c bid_t @c GROUP @c BY @c k
545 * (and the multi-column-key generalisation) has a well-defined
546 * per-row provenance : each output row's @c block_key value
547 * uniquely identifies one BID block, and the OR over that block's
548 * mulinput slots reduces to the block's key token (an independent
549 * @c gate_input). So the output is per-row independent -- TID --
550 * with the cumulative source list narrowed to the single BID
551 * source.
552 *
553 * Conservative: requires no aggregates / window functions /
554 * sublinks / SRFs / CTEs / set operations / HAVING / DISTINCT /
555 * sortClause-with-side-effects, no @c LIMIT / @c OFFSET, a flat
556 * fromlist of exactly one @c RangeTblRef pointing at a BID
557 * @c RTE_RELATION, and a @c groupClause whose resolved Vars match
558 * the source's block-key set exactly (no extra columns, no
559 * missing ones). When all met, returns @c true with
560 * @p out populated. Any failure leaves @p out untouched ; the
561 * caller proceeds to the generic dispatcher path.
562 */
565 RangeTblRef *rtr;
566 RangeTblEntry *rte;
568 Bitmapset *resolved = NULL;
569 ListCell *lc;
570
571 if (q->groupClause == NIL)
572 return false;
573 if (q->hasAggs || q->hasWindowFuncs || q->hasTargetSRFs
574 || q->hasSubLinks || q->hasModifyingCTE || q->hasDistinctOn)
575 return false;
576 if (q->cteList != NIL || q->groupingSets != NIL
577 || q->havingQual != NULL || q->setOperations != NULL
578 || q->distinctClause != NIL)
579 return false;
580 if (q->jointree == NULL
581 || list_length(q->jointree->fromlist) != 1)
582 return false;
583 if (!IsA(linitial(q->jointree->fromlist), RangeTblRef))
584 return false;
585 rtr = (RangeTblRef *) linitial(q->jointree->fromlist);
586 if (rtr->rtindex < 1 || rtr->rtindex > list_length(q->rtable))
587 return false;
588 rte = (RangeTblEntry *) list_nth(q->rtable, rtr->rtindex - 1);
589 if (rte->rtekind != RTE_RELATION)
590 return false;
591 if (!provsql_lookup_table_info(rte->relid, &info))
592 return false;
593 if (info.kind != PROVSQL_TABLE_BID)
594 return false;
595 if (info.block_key_n == 0)
596 return false; /* whole-table BID : "GROUP BY {}" doesn't exist */
597
598 foreach (lc, q->groupClause) {
599 SortGroupClause *sgc = (SortGroupClause *) lfirst(lc);
600 TargetEntry *te;
601 Node *e;
602 Var *v;
603 bool attno_in_block_key = false;
604 uint16 i;
605 te = get_sortgroupclause_tle(sgc, q->targetList);
606 if (te == NULL) { bms_free(resolved); return false; }
607 e = (Node *) te->expr;
608 /* On PG 18+, grouped Vars in the targetList point at the
609 * synthetic @c RTE_GROUP entry ; resolve through its
610 * @c groupexprs list back to the source-relation Var. */
612 while (e != NULL && IsA(e, RelabelType))
613 e = (Node *) ((RelabelType *) e)->arg;
614 if (e == NULL || !IsA(e, Var)) { bms_free(resolved); return false; }
615 v = (Var *) e;
616 if (v->varlevelsup != 0 || v->varno != rtr->rtindex) {
617 bms_free(resolved); return false;
618 }
619 for (i = 0; i < info.block_key_n; ++i)
620 if (info.block_key[i] == v->varattno) {
621 attno_in_block_key = true; break;
622 }
623 if (!attno_in_block_key) { bms_free(resolved); return false; }
624 resolved = bms_add_member(resolved, v->varattno);
625 }
626 /* Each block-key column must appear exactly once in groupClause. */
627 for (uint16 i = 0; i < info.block_key_n; ++i)
628 if (!bms_is_member(info.block_key[i], resolved)) {
629 bms_free(resolved); return false;
630 }
631 bms_free(resolved);
632
633 out->kind = PROVSQL_TABLE_TID;
634 out->source_relids = list_make1_oid(rte->relid);
635 return true;
636}
637
639 bool shape_ok = true;
640 int n_meta = 0;
641 Oid sole_relid = InvalidOid;
642
644 out->source_relids = NIL;
645
646 if (q == NULL || q->commandType != CMD_SELECT)
647 return;
648
649 /* UNION ALL specialisation : a fully-UNION-ALL tree of subquery
650 * legs each TID over a relid set disjoint from the other legs
651 * promotes to TID with the cumulative source list. Anything
652 * else (INTERSECT, EXCEPT, UNION DISTINCT, mixed kinds,
653 * overlapping leg sources) falls through to @c classify_walk,
654 * which trips the shape gate on @c q->setOperations != NULL and
655 * reports OPAQUE while still enumerating visible sources for
656 * diagnostics. */
657 if (q->setOperations != NULL && try_classify_union_all(q, out))
658 return;
659
660 /* GROUP BY on a single BID source's block-key columns reduces
661 * the output to one row per block ; each row's provenance
662 * collapses to the block's key token (an independent input
663 * gate), so the result is TID. Handled as a pre-dispatch
664 * special case because the generic shape gate refuses
665 * @c groupClause @c != @c NIL up front. */
667 return;
668
669 classify_walk(q, out, &shape_ok, &n_meta, &sole_relid);
670
671 if (!shape_ok) {
672 /* Conservative : when we cannot fully see the query, we cannot
673 * certify TID-ness even if the visible RTEs carry no metadata --
674 * a hidden subquery might pull in correlated rows. */
676 } else if (n_meta == 0) {
677 /* Fully visible and no provenance-tracked source : the result is
678 * deterministic, hence trivially TID. */
679 out->kind = PROVSQL_TABLE_TID;
680 } else if (n_meta == 1) {
682 if (provsql_lookup_table_info(sole_relid, &info)) {
683 if (info.kind == PROVSQL_TABLE_BID) {
684 /* BID : the output is BID iff every block-key column of the
685 * source survives in the outer target list (matched by Var
686 * resolution through any @c RTE_SUBQUERY descent, not by
687 * output column name). Otherwise the mutually-exclusive
688 * partitioning the user could observe is lost -- downgrade
689 * to OPAQUE. Whole-table BID (@c block_key_n @c == @c 0)
690 * is trivially preserved. */
691 if (info.block_key_n == 0
692 || bid_block_key_preserved(q, sole_relid, &info))
693 out->kind = PROVSQL_TABLE_BID;
694 else
696 } else {
697 out->kind = (provsql_table_kind) info.kind;
698 }
699 }
700 /* If the lookup races and disappears between the two calls,
701 * fall back to OPAQUE. */
702 } else {
703 /* Multiple tracked sources : promote to TID when every source
704 * is TID and the registered ancestor sets are pairwise
705 * disjoint. Any failure (non-TID source, missing registry
706 * entry, ancestor overlap) leaves @c out->kind at OPAQUE,
707 * which is the conservative default already set above. */
710 }
711}
712
714 StringInfoData buf;
715 ListCell *lc;
716 bool first = true;
717
718 initStringInfo(&buf);
719 appendStringInfo(&buf, "query result is %s", kind_label(c->kind));
720
721 /* TID / BID : the source list is complete and tells the user which
722 * tracked relations contribute the per-row uncertainty. OPAQUE :
723 * we deliberately omit sources -- when the shape gate trips on a
724 * sublink, a set operation, GROUP BY, etc., the rtable walk only
725 * reaches the syntactically visible sources, so the list would
726 * be partial and misleadingly suggest completeness. The user
727 * already has the query text in front of them and can identify
728 * which relations are involved without our help. */
729 if (c->kind != PROVSQL_TABLE_OPAQUE) {
730 if (c->source_relids == NIL) {
731 appendStringInfoString(&buf, " (no provenance-tracked sources)");
732 } else {
733 appendStringInfoString(&buf, " (sources: ");
734 foreach (lc, c->source_relids) {
735 Oid relid = lfirst_oid(lc);
736 char *nspname = get_namespace_name(get_rel_namespace(relid));
737 char *relname = get_rel_name(relid);
738
739 if (!first)
740 appendStringInfoString(&buf, ", ");
741 first = false;
742
743 if (nspname != NULL && relname != NULL)
744 appendStringInfo(&buf, "%s.%s",
745 quote_identifier(nspname),
746 quote_identifier(relname));
747 else if (relname != NULL)
748 appendStringInfoString(&buf, quote_identifier(relname));
749 else
750 appendStringInfo(&buf, "<oid %u>", relid);
751 }
752 appendStringInfoChar(&buf, ')');
753 }
754 }
755
756 provsql_notice("%s", buf.data);
757 pfree(buf.data);
758}
provsql_table_kind
How the provenance leaves of a tracked relation are correlated.
@ PROVSQL_TABLE_TID
@ PROVSQL_TABLE_BID
@ PROVSQL_TABLE_OPAQUE
#define PROVSQL_TABLE_INFO_MAX_ANCESTORS
Cap on the number of base ancestors recorded per relation.
static void classify_walk(Query *q, ProvSQLClassification *out, bool *shape_ok, int *n_meta, Oid *sole_relid)
Recursive walker shared by the top-level entry point and the RTE_SUBQUERY descent.
bool provsql_classify_top_level
Backing storage for the provsql.classify_top_level GUC.
static bool collect_union_all_legs(Node *node, Query *parent, List **legs)
Walk a SetOperationStmt tree, collecting each leaf leg's Query body into legs.
static bool try_classify_multi_source_tid(ProvSQLClassification *out)
Conservative multi-source promotion: when every tracked source in out->source_relids is TID and the r...
static bool bid_block_key_preserved(Query *q, Oid source_relid, const ProvenanceTableInfo *info)
Decide whether every block-key column of info survives in q's target list – resolved transitively thr...
void provsql_classify_emit_notice(const ProvSQLClassification *c)
Render the result of provsql_classify_query as a NOTICE.
static bool try_classify_groupby_block_key(Query *q, ProvSQLClassification *out)
Pre-dispatch special case for GROUP BY on a single BID source's block-key columns.
static bool try_classify_union_all(Query *q, ProvSQLClassification *out)
Promote a fully-UNION-ALL Query to TID when each leg classifies as TID and the leg source-relid sets ...
static Node * resolve_through_group_rte(Query *q, Node *e)
PG 18+ helper: when q has a synthetic RTE_GROUP entry (set parseCheckAggregates() appends it for ever...
static bool resolve_var_to_base(Query *q, Index varno, AttrNumber attno, Oid *out_relid, AttrNumber *out_attno)
Resolve a base-level (varno, attno) pair in q transitively through RTE_SUBQUERY layers until reaching...
static bool classify_fromlist_shape_ok(Node *n)
Decide whether a jointree fromlist entry has a shape the classifier can certify : a plain RangeTblRef...
static const char * kind_label(provsql_table_kind k)
Map a provsql_table_kind to its uppercase user-facing label.
void provsql_classify_query(Query *q, ProvSQLClassification *out)
Classify the result relation of a parsed top-level Query.
Public surface of the query-time TID / BID / OPAQUE classifier.
#define provsql_notice(fmt,...)
Emit a ProvSQL informational notice (execution continues).
bool provsql_lookup_ancestry(Oid relid, uint16 *ancestor_n_out, Oid *ancestors_out)
Look up the base-ancestor set of a tracked relation.
bool provsql_lookup_table_info(Oid relid, ProvenanceTableInfo *out)
Look up per-table provenance metadata with a backend-local cache.
Core types, constants, and utilities shared across ProvSQL.
Result of provsql_classify_query.
provsql_table_kind kind
Per-relation metadata for the safe-query optimisation.
AttrNumber block_key[PROVSQL_TABLE_INFO_MAX_BLOCK_KEY]
Block-key column numbers.
uint16_t block_key_n
Number of valid entries in block_key.
uint8_t kind
One of provsql_table_kind.