ProvSQL C/C++ API
Adding support for provenance and uncertainty management to PostgreSQL databases
Loading...
Searching...
No Matches
AnalyticEvaluator.cpp
Go to the documentation of this file.
1/**
2 * @file AnalyticEvaluator.cpp
3 * @brief Implementation of the closed-form CDF resolution pass.
4 * See @c AnalyticEvaluator.h for the full docstring.
5 */
6#include "AnalyticEvaluator.h"
7
8#include <cmath>
9#include <limits>
10#include <optional>
11#include <vector>
12
13#include "Aggregation.h" // ComparisonOperator + cmpOpFromOid
14#include "RandomVariable.h" // parse_distribution_spec, parseDoubleStrict, DistKind
15extern "C" {
16#include "provsql_utils.h" // gate_type
17}
18
19namespace provsql {
20
21double pdfAt(const DistributionSpec &d, double c)
22{
23 double pdf_c = std::numeric_limits<double>::quiet_NaN();
24 switch (d.kind) {
25 case DistKind::Normal: {
26 /* f(c) = (1 / (σ √(2π))) · exp(-(c-μ)² / (2σ²)). Numerically
27 * stable for any finite c; for c far in the tail the exp
28 * underflows to 0 cleanly. */
29 const double mu = d.p1, sigma = d.p2;
30 if (!(sigma > 0.0)) break;
31 static const double SQRT_2PI = std::sqrt(2.0 * M_PI);
32 const double z = (c - mu) / sigma;
33 pdf_c = std::exp(-0.5 * z * z) / (sigma * SQRT_2PI);
34 break;
35 }
36 case DistKind::Uniform: {
37 const double a = d.p1, b = d.p2;
38 if (!(b > a)) break;
39 pdf_c = (c < a || c > b) ? 0.0 : 1.0 / (b - a);
40 break;
41 }
43 const double lambda = d.p1;
44 if (!(lambda > 0.0)) break;
45 pdf_c = (c < 0.0) ? 0.0 : lambda * std::exp(-lambda * c);
46 break;
47 }
48 case DistKind::Erlang: {
49 /* f(c; k, λ) = λ^k · c^(k-1) · e^(-λc) / (k-1)! for c >= 0.
50 * Same integer-k caveat as the CDF: non-integer shapes need
51 * the regularised lower incomplete gamma and are out of scope. */
52 const double s = d.p1, lambda = d.p2;
53 if (s < 1.0 || s != std::floor(s) || !(lambda > 0.0)) break;
54 if (c < 0.0) { pdf_c = 0.0; break; }
55 const unsigned long k = static_cast<unsigned long>(s);
56 /* (k-1)! is small for the typical Erlang shapes (k <= ~20);
57 * compute incrementally to keep precision. */
58 double fact = 1.0;
59 for (unsigned long i = 2; i < k; ++i) fact *= static_cast<double>(i);
60 pdf_c = std::pow(lambda, static_cast<double>(k))
61 * std::pow(c, static_cast<double>(k - 1))
62 * std::exp(-lambda * c)
63 / fact;
64 break;
65 }
66 }
67 return pdf_c;
68}
69
70double cdfAt(const DistributionSpec &d, double c)
71{
72 double cdf_c = std::numeric_limits<double>::quiet_NaN();
73 switch (d.kind) {
74 case DistKind::Normal: {
75 /* Φ((c - μ)/σ) = ½ (1 + erf((c - μ) / (σ √2))). Standard
76 * formula; std::erf is C99 / C++11. */
77 static const double SQRT2 = std::sqrt(2.0);
78 double z = (c - d.p1) / d.p2;
79 cdf_c = 0.5 * (1.0 + std::erf(z / SQRT2));
80 break;
81 }
83 if (c <= d.p1) cdf_c = 0.0;
84 else if (c >= d.p2) cdf_c = 1.0;
85 else cdf_c = (c - d.p1) / (d.p2 - d.p1);
86 break;
88 if (c <= 0.0) cdf_c = 0.0;
89 else cdf_c = -std::expm1(-d.p1 * c); /* 1 - exp(-λc) */
90 break;
91 case DistKind::Erlang: {
92 /* For integer shape s ≥ 1, the Erlang CDF has the finite-sum
93 * form F(c; s, λ) = 1 - e^{-λc} Σ_{n=0..s-1} (λc)^n / n!.
94 * Numerically stable for the small-to-moderate s the simplifier
95 * produces (a SUM of k i.i.d. Exp gates). Non-integer s would
96 * require the regularised lower incomplete gamma function, so
97 * we skip those cases by leaving cdf_c = NaN (caller treats NaN
98 * as "undecided" and the cmp falls through to MC). */
99 const double s = d.p1, lambda = d.p2;
100 if (s < 1.0 || s != std::floor(s)) break;
101 if (c <= 0.0) { cdf_c = 0.0; break; }
102 const double lc = lambda * c;
103 double term = 1.0; /* (λc)^0 / 0! */
104 double sum = 1.0;
105 const unsigned long k = static_cast<unsigned long>(s);
106 for (unsigned long n = 1; n < k; ++n) {
107 term *= lc / static_cast<double>(n);
108 sum += term;
109 }
110 cdf_c = 1.0 - std::exp(-lc) * sum;
111 break;
112 }
113 }
114 return cdf_c;
115}
116
117namespace {
118
119/* All four ordered comparators reduce to either F(c) or 1 - F(c)
120 * (continuous: @c < and @c <= have the same probability, ditto @c >
121 * and @c >=). EQ / NE on continuous RVs are handled universally by
122 * RangeCheck (P(X = c) = 0, P(X != c) = 1, sound in every semiring
123 * via gate_zero / gate_one); they should never reach this function. */
124double cdfDecide(const DistributionSpec &d, ComparisonOperator op, double c)
125{
126 double cdf_c = cdfAt(d, c);
127 if (std::isnan(cdf_c)) return cdf_c;
128
129 switch (op) {
132 return cdf_c;
135 return 1.0 - cdf_c;
138 /* Should have been handled upstream by RangeCheck; if we still
139 * see one here it means RangeCheck did not run (e.g.
140 * provsql.simplify_on_load is off). Fall through to undecided
141 * rather than silently make an inconsistent choice. */
142 return std::numeric_limits<double>::quiet_NaN();
143 }
144 return std::numeric_limits<double>::quiet_NaN();
145}
146
147/* Mirror @c provsql_having_detail::flip_op without taking the
148 * dependency on @c having_semantics from this file. Used to
149 * normalise @c c @c cmp @c X into @c X @c flip(cmp) @c c. */
151{
152 switch (op) {
159 }
160 return op;
161}
162
163/* X cmp Y for X, Y independent normals. Reduces to (X - Y) cmp 0
164 * with X - Y ~ N(μ_X - μ_Y, σ_X² + σ_Y²). */
165double normalDiffDecide(const DistributionSpec &X,
166 const DistributionSpec &Y,
168{
169 DistributionSpec diff;
170 diff.kind = DistKind::Normal;
171 diff.p1 = X.p1 - Y.p1;
172 diff.p2 = std::sqrt(X.p2 * X.p2 + Y.p2 * Y.p2);
173 return cdfDecide(diff, op, 0.0);
174}
175
176/* Try to parse a @c gate_value's extra as a double. Returns NaN on
177 * any failure (caller treats NaN as "skip this cmp"). */
178double bareValue(const GenericCircuit &gc, gate_t g)
179{
180 if (gc.getGateType(g) != gate_value)
181 return std::numeric_limits<double>::quiet_NaN();
182 try { return parseDoubleStrict(gc.getExtra(g)); }
183 catch (const CircuitException &) {
184 return std::numeric_limits<double>::quiet_NaN();
185 }
186}
187
188/* Try to parse a @c gate_rv's distribution spec. Returns @c
189 * std::nullopt on any failure. */
190std::optional<DistributionSpec>
191bareRv(const GenericCircuit &gc, gate_t g)
192{
193 if (gc.getGateType(g) != gate_rv)
194 return std::nullopt;
195 return parse_distribution_spec(gc.getExtra(g));
196}
197
198/* Closed-form P(X cmp c) for a categorical-form gate_mixture X. X's
199 * wires are [key, mul_1, ..., mul_n]; each mul_i carries its
200 * probability in set_prob and its outcome value in extra (parsed as
201 * float8). The probability is just the sum of π_i over mulinputs
202 * whose value satisfies the predicate.
203 *
204 * EQ / NE are exact too in this setting (X = c iff some outcome equals
205 * c with positive mass): the RangeCheck pre-pass treats EQ / NE over
206 * continuous RVs as P=0 / P=1, but a categorical is discrete so we
207 * decide them here. Returns NaN if any mulinput's extra fails to
208 * parse as a finite float8 -- the cmp then falls through to MC. */
209double categoricalDecide(const GenericCircuit &gc, gate_t mix,
210 ComparisonOperator op, double c)
211{
212 const auto &wires = gc.getWires(mix);
213 double p = 0.0;
214 for (std::size_t i = 1; i < wires.size(); ++i) {
215 double v;
216 try { v = parseDoubleStrict(gc.getExtra(wires[i])); }
217 catch (const CircuitException &) {
218 return std::numeric_limits<double>::quiet_NaN();
219 }
220 bool hit = false;
221 switch (op) {
222 case ComparisonOperator::LT: hit = v < c; break;
223 case ComparisonOperator::LE: hit = v <= c; break;
224 case ComparisonOperator::GT: hit = v > c; break;
225 case ComparisonOperator::GE: hit = v >= c; break;
226 case ComparisonOperator::EQ: hit = v == c; break;
227 case ComparisonOperator::NE: hit = v != c; break;
228 }
229 if (hit) p += gc.getProb(wires[i]);
230 }
231 return p;
232}
233
234/**
235 * @brief Try to decide @p cmp_gate via a closed-form CDF.
236 *
237 * Recognised shapes:
238 * - @c X @c cmp @c c (X a bare @c gate_rv, c a bare @c gate_value)
239 * - @c c @c cmp @c X (mirror of the above; flip the comparator)
240 * - @c X @c cmp @c Y where both @c X and @c Y are bare normal
241 * @c gate_rv leaves with distinct UUIDs (independence test)
242 *
243 * Returns the analytical probability in [0, 1] when decided,
244 * @c NaN otherwise.
245 */
246double tryAnalyticDecide(const GenericCircuit &gc, gate_t cmp_gate)
247{
248 bool ok = false;
249 ComparisonOperator op = cmpOpFromOid(gc.getInfos(cmp_gate).first, ok);
250 if (!ok) return std::numeric_limits<double>::quiet_NaN();
251
252 const auto &wires = gc.getWires(cmp_gate);
253 if (wires.size() != 2) return std::numeric_limits<double>::quiet_NaN();
254 gate_t lhs = wires[0], rhs = wires[1];
255
256 /* X cmp c */
257 if (auto specX = bareRv(gc, lhs)) {
258 double c = bareValue(gc, rhs);
259 if (!std::isnan(c)) return cdfDecide(*specX, op, c);
260 }
261
262 /* c cmp X */
263 if (auto specX = bareRv(gc, rhs)) {
264 double c = bareValue(gc, lhs);
265 if (!std::isnan(c)) return cdfDecide(*specX, flipCmpOp(op), c);
266 }
267
268 /* Categorical mixture cmp constant: exact sum of mass over the
269 * mulinputs whose value satisfies the predicate. EQ / NE are
270 * meaningful on a discrete distribution and decided here rather
271 * than the continuous-default route RangeCheck takes. */
272 if (gc.isCategoricalMixture(lhs)) {
273 double c = bareValue(gc, rhs);
274 if (!std::isnan(c)) return categoricalDecide(gc, lhs, op, c);
275 }
276 if (gc.isCategoricalMixture(rhs)) {
277 double c = bareValue(gc, lhs);
278 if (!std::isnan(c)) return categoricalDecide(gc, rhs, flipCmpOp(op), c);
279 }
280
281 /* X cmp Y, both bare normal RVs. The @c X cmp X same-UUID case
282 * is handled upstream by RangeCheck's identity shortcut, so by the
283 * time we get here distinct UUIDs implies independence (each
284 * @c provsql.normal call mints a fresh @c uuid_generate_v4 token). */
285 {
286 auto specX = bareRv(gc, lhs);
287 auto specY = bareRv(gc, rhs);
288 if (specX && specY &&
289 specX->kind == DistKind::Normal &&
290 specY->kind == DistKind::Normal) {
291 return normalDiffDecide(*specX, *specY, op);
292 }
293 }
294
295 return std::numeric_limits<double>::quiet_NaN();
296}
297
298} // namespace
299
301{
302 unsigned resolved = 0;
303 const auto nb = gc.getNbGates();
304
305 /* Snapshot the cmp-gate ids so in-place rewrites don't affect the
306 * iteration: same pattern as @c runRangeCheck. */
307 std::vector<gate_t> cmps;
308 for (std::size_t i = 0; i < nb; ++i) {
309 auto g = static_cast<gate_t>(i);
310 if (gc.getGateType(g) == gate_cmp)
311 cmps.push_back(g);
312 }
313
314 for (gate_t c : cmps) {
315 if (gc.getGateType(c) != gate_cmp) continue;
316 double p = tryAnalyticDecide(gc, c);
317 if (!std::isnan(p)) {
318 /* Clamp to [0, 1] defensively: floating-point CDF roundoff
319 * could in principle produce values marginally outside the
320 * unit interval (1 - F(c) for c far in the right tail). */
321 if (p < 0.0) p = 0.0;
322 if (p > 1.0) p = 1.0;
323 gc.resolveCmpToBernoulli(c, p);
324 ++resolved;
325 }
326 }
327
328 return resolved;
329}
330
331} // namespace provsql
ComparisonOperator cmpOpFromOid(Oid op_oid, bool &ok)
Map a PostgreSQL comparison-operator OID to a ComparisonOperator.
Typed aggregation value, operator, and aggregator abstractions.
ComparisonOperator
SQL comparison operators used in gate_cmp circuit gates.
Definition Aggregation.h:38
@ LT
Less than (<).
Definition Aggregation.h:42
@ GT
Greater than (>).
Definition Aggregation.h:44
@ LE
Less than or equal (<=).
Definition Aggregation.h:41
@ NE
Not equal (<>).
Definition Aggregation.h:40
@ GE
Greater than or equal (>=).
Definition Aggregation.h:43
Closed-form CDF resolution for trivial gate_cmp shapes.
gate_t
Strongly-typed gate identifier.
Definition Circuit.h:49
Continuous random-variable helpers (distribution parsing, moments).
std::vector< gate_t > & getWires(gate_t g)
Return a mutable reference to the child-wire list of gate g.
Definition Circuit.h:140
gateType getGateType(gate_t g) const
Return the type of gate g.
Definition Circuit.h:130
std::vector< gate_t >::size_type getNbGates() const
Return the total number of gates in the circuit.
Definition Circuit.h:103
In-memory provenance circuit with semiring-generic evaluation.
bool isCategoricalMixture(gate_t g) const
Test whether g is a categorical-form gate_mixture (the explicit provsql.categorical output).
std::string getExtra(gate_t g) const
Return the string extra for gate g.
double getProb(gate_t g) const
Return the probability for gate g.
void resolveCmpToBernoulli(gate_t g, double p)
Replace a gate_cmp by a constant Boolean leaf (gate_one for p == 1, gate_zero for p == 0) or by a Ber...
std::pair< unsigned, unsigned > getInfos(gate_t g) const
Return the integer annotation pair for gate g.
@ Normal
Normal (Gaussian): p1=μ, p2=σ
@ Exponential
Exponential: p1=λ, p2 unused.
@ Uniform
Uniform on [a,b]: p1=a, p2=b.
@ Erlang
Erlang: p1=k (positive integer), p2=λ.
double parseDoubleStrict(const std::string &s)
Strictly parse s as a double.
double pdfAt(const DistributionSpec &d, double c)
Closed-form probability density for a basic distribution.
std::optional< DistributionSpec > parse_distribution_spec(const std::string &s)
Parse the on-disk text encoding of a gate_rv distribution.
double cdfAt(const DistributionSpec &d, double c)
Closed-form CDF for a basic continuous distribution.
unsigned runAnalyticEvaluator(GenericCircuit &gc)
Run the closed-form CDF resolution pass over gc.
Core types, constants, and utilities shared across ProvSQL.
@ gate_rv
Continuous random-variable leaf (extra encodes distribution).
Parsed distribution spec (kind + up to two parameters).
double p2
Second parameter (σ or b; unused for Exponential).
double p1
First parameter (μ, a, or λ).