ProvSQL C/C++ API
Adding support for provenance and uncertainty management to PostgreSQL databases
Loading...
Searching...
No Matches
RvAnalyticalCurves.cpp
Go to the documentation of this file.
1/**
2 * @file RvAnalyticalCurves.cpp
3 * @brief SQL function `provsql.rv_analytical_curves(token, samples, prov)`.
4 *
5 * Returns a JSON object with closed-form curves for the (possibly
6 * conditional) distribution rooted at @p token, or @c NULL when no
7 * closed form applies. The payload has up to three fields:
8 *
9 * - @c pdf — @p samples evenly-spaced @c {x, p} points covering the
10 * continuous part of the distribution. Absent for pure-discrete
11 * shapes (Dirac / categorical).
12 * - @c cdf — same x grid as @c pdf, with cumulative probability.
13 * - @c stems — point masses @c {x, p} produced by Dirac (@c gate_value
14 * wrapped as an @c as_random) or categorical roots, or by Dirac/
15 * categorical arms inside a Bernoulli mixture. Weights propagate
16 * through nested mixtures (each ancestor's @c p / 1-p applies).
17 *
18 * Used by ProvSQL Studio's Distribution profile panel to overlay
19 * analytical curves and point-mass discs on the empirical histogram
20 * drawn from @c rv_histogram.
21 *
22 * Supported shapes:
23 * - bare @c gate_rv root (Normal / Uniform / Exponential / Erlang
24 * with integer shape), optionally truncated by an AND-conjunct
25 * event extracted via @c collectRvConstraints;
26 * - Dirac point (@c gate_value with finite extra, surfaced by
27 * @c provsql.as_random);
28 * - categorical-form @c gate_mixture (one @c {key, mul_1..n});
29 * - classic Bernoulli @c gate_mixture (@c [p_token, x, y]) over any
30 * two recursively-matched shapes; @c p_token must be a bare
31 * @c gate_input (compound Boolean @c p bails).
32 *
33 * Truncation is honoured only on the bare-RV path; mixtures /
34 * categoricals / Diracs are matched only with a trivial event.
35 *
36 * @see provsql::matchClosedFormDistribution in RangeCheck.h
37 */
38extern "C" {
39#include "postgres.h"
40#include "fmgr.h"
41#include "utils/jsonb.h"
42#include "utils/fmgrprotos.h"
43#include "utils/uuid.h"
44#include "provsql_utils.h"
45#include "provsql_error.h"
46
47PG_FUNCTION_INFO_V1(rv_analytical_curves);
48}
49
50#include "AnalyticEvaluator.h" // pdfAt, cdfAt
51#include "CircuitFromMMap.h" // getJointCircuit
52#include "GenericCircuit.h"
53#include "HybridEvaluator.h" // runHybridSimplifier
54#include "RandomVariable.h" // DistKind
55#include "RangeCheck.h" // matchClosedFormDistribution + variant
56#include "provsql_utils_cpp.h"
57
58#include <algorithm>
59#include <cmath>
60#include <iomanip>
61#include <limits>
62#include <optional>
63#include <sstream>
64#include <type_traits>
65#include <utility>
66#include <variant>
67#include <vector>
68
69namespace {
70
71/**
72 * @brief Choose a sensible x-range for the continuous curve given a
73 * single-RV spec and an optional truncation.
74 *
75 * Unbounded distributions (Normal) get a heuristic window around the
76 * mean; bounded distributions (Uniform) get a slight padding so the
77 * support boundary doesn't sit flush with the SVG edge; one-sided
78 * supports (Exponential, Erlang) get @c 6/λ on the right.
79 *
80 * Truncation clamps the window: the curve never extends past the
81 * conditioning event's interval.
82 */
83std::pair<double, double>
84bare_x_range(const provsql::DistributionSpec &spec,
85 double trunc_lo, double trunc_hi)
86{
87 double lo = trunc_lo, hi = trunc_hi;
88 switch (spec.kind) {
90 const double mu = spec.p1, sigma = spec.p2;
91 if (!std::isfinite(lo)) lo = mu - 4.0 * sigma;
92 if (!std::isfinite(hi)) hi = mu + 4.0 * sigma;
93 break;
94 }
96 const double a = spec.p1, b = spec.p2;
97 const double pad = 0.15 * (b - a);
98 if (!std::isfinite(lo)) lo = a - pad;
99 else lo = std::max(lo, a - pad);
100 if (!std::isfinite(hi)) hi = b + pad;
101 else hi = std::min(hi, b + pad);
102 break;
103 }
105 const double lambda = spec.p1;
106 if (!std::isfinite(lo)) lo = 0.0;
107 if (!std::isfinite(hi)) hi = 6.0 / lambda;
108 break;
109 }
111 const double k = spec.p1, lambda = spec.p2;
112 if (!std::isfinite(lo)) lo = 0.0;
113 if (!std::isfinite(hi)) hi = std::max(2.0 * k / lambda, 6.0 / lambda);
114 break;
115 }
116 }
117 return {lo, hi};
118}
119
120/**
121 * @brief Per-sample truncated PDF for a single-RV arm. Returns the
122 * unconditional value when @c truncated == @c false. Yields
123 * @c NaN when the closed-form PDF doesn't cover the spec
124 * (e.g. non-integer Erlang shape, propagated from
125 * @c provsql::pdfAt).
126 */
127double bare_pdf(const provsql::TruncatedSingleRv &t, double x)
128{
129 double p = provsql::pdfAt(t.spec, x);
130 if (std::isnan(p)) return std::numeric_limits<double>::quiet_NaN();
131 if (!t.truncated) return p;
132 if (x < t.lo || x > t.hi) return 0.0;
133 const double cdf_lo = std::isfinite(t.lo) ? provsql::cdfAt(t.spec, t.lo) : 0.0;
134 const double cdf_hi = std::isfinite(t.hi) ? provsql::cdfAt(t.spec, t.hi) : 1.0;
135 const double Z = cdf_hi - cdf_lo;
136 if (!(Z > 0.0)) return std::numeric_limits<double>::quiet_NaN();
137 return p / Z;
138}
139
140double bare_cdf(const provsql::TruncatedSingleRv &t, double x)
141{
142 double c = provsql::cdfAt(t.spec, x);
143 if (std::isnan(c)) return std::numeric_limits<double>::quiet_NaN();
144 if (!t.truncated) return c;
145 if (x < t.lo) return 0.0;
146 if (x > t.hi) return 1.0;
147 const double cdf_lo = std::isfinite(t.lo) ? provsql::cdfAt(t.spec, t.lo) : 0.0;
148 const double cdf_hi = std::isfinite(t.hi) ? provsql::cdfAt(t.spec, t.hi) : 1.0;
149 const double Z = cdf_hi - cdf_lo;
150 if (!(Z > 0.0)) return std::numeric_limits<double>::quiet_NaN();
151 return (c - cdf_lo) / Z;
152}
153
154/**
155 * @brief Recursive @c pdf(x) over the @c ClosedFormShape variant.
156 * Dirac / categorical arms contribute 0 (point masses live in
157 * the @c stems channel, not in the continuous PDF). Mixtures
158 * combine arms linearly with the Bernoulli weight.
159 */
160double shape_pdf(const provsql::ClosedFormShape &s, double x);
161double shape_cdf(const provsql::ClosedFormShape &s, double x);
162
163double shape_pdf(const provsql::ClosedFormShape &s, double x)
164{
165 return std::visit([&](const auto &v) -> double {
166 using T = std::decay_t<decltype(v)>;
167 if constexpr (std::is_same_v<T, provsql::TruncatedSingleRv>) {
168 return bare_pdf(v, x);
169 } else if constexpr (std::is_same_v<T, provsql::DiracShape>) {
170 (void)x;
171 return 0.0;
172 } else if constexpr (std::is_same_v<T, provsql::CategoricalShape>) {
173 (void)x;
174 return 0.0;
175 } else if constexpr (std::is_same_v<T, provsql::BernoulliMixtureShape>) {
176 const double pl = shape_pdf(*v.left, x);
177 const double pr = shape_pdf(*v.right, x);
178 if (std::isnan(pl) || std::isnan(pr))
179 return std::numeric_limits<double>::quiet_NaN();
180 return v.p * pl + (1.0 - v.p) * pr;
181 }
182 return std::numeric_limits<double>::quiet_NaN();
183 }, s);
184}
185
186double shape_cdf(const provsql::ClosedFormShape &s, double x)
187{
188 return std::visit([&](const auto &v) -> double {
189 using T = std::decay_t<decltype(v)>;
190 if constexpr (std::is_same_v<T, provsql::TruncatedSingleRv>) {
191 return bare_cdf(v, x);
192 } else if constexpr (std::is_same_v<T, provsql::DiracShape>) {
193 return (x >= v.value) ? 1.0 : 0.0;
194 } else if constexpr (std::is_same_v<T, provsql::CategoricalShape>) {
195 double sum = 0.0;
196 for (const auto &pr : v.outcomes) if (pr.first <= x) sum += pr.second;
197 return sum;
198 } else if constexpr (std::is_same_v<T, provsql::BernoulliMixtureShape>) {
199 const double cl = shape_cdf(*v.left, x);
200 const double cr = shape_cdf(*v.right, x);
201 if (std::isnan(cl) || std::isnan(cr))
202 return std::numeric_limits<double>::quiet_NaN();
203 return v.p * cl + (1.0 - v.p) * cr;
204 }
205 return std::numeric_limits<double>::quiet_NaN();
206 }, s);
207}
208
209bool shape_has_continuous(const provsql::ClosedFormShape &s)
210{
211 return std::visit([](const auto &v) -> bool {
212 using T = std::decay_t<decltype(v)>;
213 if constexpr (std::is_same_v<T, provsql::TruncatedSingleRv>) return true;
214 else if constexpr (std::is_same_v<T, provsql::DiracShape>) return false;
215 else if constexpr (std::is_same_v<T, provsql::CategoricalShape>) return false;
216 else if constexpr (std::is_same_v<T, provsql::BernoulliMixtureShape>)
217 return shape_has_continuous(*v.left) || shape_has_continuous(*v.right);
218 return false;
219 }, s);
220}
221
222/**
223 * @brief Walk the shape collecting weighted stem points. @p weight
224 * is the running Bernoulli product from the path root; the
225 * leaf-level mass is multiplied by it so e.g. a Dirac inside
226 * @c mixture(0.3, X, c) appears at @c (c, 0.7).
227 */
228void shape_stems(const provsql::ClosedFormShape &s, double weight,
229 std::vector<std::pair<double, double>> &out)
230{
231 std::visit([&](const auto &v) {
232 using T = std::decay_t<decltype(v)>;
233 if constexpr (std::is_same_v<T, provsql::TruncatedSingleRv>) {
234 (void)v; // continuous arm: contributes no stems
235 } else if constexpr (std::is_same_v<T, provsql::DiracShape>) {
236 out.emplace_back(v.value, weight);
237 } else if constexpr (std::is_same_v<T, provsql::CategoricalShape>) {
238 for (const auto &pr : v.outcomes)
239 out.emplace_back(pr.first, weight * pr.second);
240 } else if constexpr (std::is_same_v<T, provsql::BernoulliMixtureShape>) {
241 shape_stems(*v.left, weight * v.p, out);
242 shape_stems(*v.right, weight * (1.0 - v.p), out);
243 }
244 }, s);
245}
246
247std::pair<double, double> shape_x_range(const provsql::ClosedFormShape &s)
248{
249 return std::visit([](const auto &v) -> std::pair<double, double> {
250 using T = std::decay_t<decltype(v)>;
251 if constexpr (std::is_same_v<T, provsql::TruncatedSingleRv>) {
252 return bare_x_range(v.spec, v.lo, v.hi);
253 } else if constexpr (std::is_same_v<T, provsql::DiracShape>) {
254 /* Pure Dirac: pad ±1 around the point so the disc isn't flush
255 * against the SVG edge. When this Dirac is nested under a
256 * mixture, the sibling's range usually dominates. */
257 return {v.value - 1.0, v.value + 1.0};
258 } else if constexpr (std::is_same_v<T, provsql::CategoricalShape>) {
259 double mn = std::numeric_limits<double>::infinity();
260 double mx = -std::numeric_limits<double>::infinity();
261 for (const auto &pr : v.outcomes) {
262 mn = std::min(mn, pr.first);
263 mx = std::max(mx, pr.first);
264 }
265 const double range = mx - mn;
266 const double pad = range > 0.0 ? 0.1 * range : 1.0;
267 return {mn - pad, mx + pad};
268 } else if constexpr (std::is_same_v<T, provsql::BernoulliMixtureShape>) {
269 const auto L = shape_x_range(*v.left);
270 const auto R = shape_x_range(*v.right);
271 return {std::min(L.first, R.first), std::max(L.second, R.second)};
272 }
273 return {0.0, 1.0};
274 }, s);
275}
276
277} // namespace
278
279extern "C" Datum
280rv_analytical_curves(PG_FUNCTION_ARGS)
281{
282 pg_uuid_t *token = (pg_uuid_t *) PG_GETARG_POINTER(0);
283 int32 samples = PG_GETARG_INT32(1);
284 pg_uuid_t *prov = (pg_uuid_t *) PG_GETARG_POINTER(2);
285
286 if (samples < 2)
288 "rv_analytical_curves: samples must be at least 2 (got %d)",
289 samples);
290
291 try {
292 gate_t root_gate, event_gate;
294 try {
295 gc = getJointCircuit(*token, *prov, root_gate, event_gate);
296 } catch (const CircuitException &) {
297 PG_RETURN_NULL();
298 }
299
300 /* Run the hybrid-evaluator simplifier so the analytical curves
301 * see the same folded tree Studio's circuit view shows via
302 * simplified_circuit_subgraph: c·Exp(λ) → Exp(λ/c), N(μ,σ)+N(...)
303 * → single normal, Erlang sums, etc. Without this pass the
304 * c·Exp(λ) root would be a gate_arith composite that
305 * matchClosedFormDistribution does not match, so the panel would
306 * silently fall back to histogram-only on a circuit that looks
307 * like a single Exp node. */
310
311 /* Generalised closed-form match: bare RV (with optional
312 * truncation), Dirac, categorical, or Bernoulli mixture over any
313 * recursively-matched shape. Non-matched shapes (gate_arith
314 * composites, mismatched Erlang shapes, ...) fall through to
315 * NULL so the front-end renders histogram-only without a
316 * structural pre-check. */
318 gc, root_gate, std::optional<gate_t>{event_gate});
319 if (!shape) PG_RETURN_NULL();
320
321 std::vector<std::pair<double, double>> stems;
322 shape_stems(*shape, 1.0, stems);
323 const bool has_cont = shape_has_continuous(*shape);
324
325 /* Nothing to render: shouldn't normally happen (shape matched
326 * but produced neither continuous nor discrete output), but
327 * guards against an empty stem list from a categorical with all
328 * zero-mass outcomes etc. */
329 if (!has_cont && stems.empty()) PG_RETURN_NULL();
330
331 /* x-range chosen over the full shape; for a mixture this is the
332 * union of branch ranges, so the curve covers both modes. A
333 * pure-stems shape still gets a small window for the chart axis. */
334 auto [x_lo, x_hi] = shape_x_range(*shape);
335 if (!(x_lo < x_hi)) PG_RETURN_NULL();
336
337 std::ostringstream out;
338 /* setprecision(17) keeps each sample bit-round-trippable through
339 * jsonb_in's parser, matching the convention used by rv_histogram
340 * for its bin_lo / bin_hi fields. */
341 out << std::setprecision(17);
342 out << '{';
343 bool first_field = true;
344
345 /* CDF is well-defined for every supported shape (a staircase for
346 * pure-discrete, a smooth curve for continuous, a curve-with-
347 * jumps for mixed), so emit it unconditionally. PDF is only
348 * meaningful when there's a continuous component; for pure
349 * point-mass shapes the pdf samples would all be zero and the
350 * smooth overlay path would be meaningless. */
351 std::ostringstream pdf_out;
352 pdf_out << std::setprecision(17);
353 if (has_cont) pdf_out << "\"pdf\":[";
354 out << "\"cdf\":[";
355 for (int i = 0; i < samples; ++i) {
356 const double t = static_cast<double>(i) / (samples - 1);
357 const double x = x_lo + t * (x_hi - x_lo);
358 const double cdf_x = shape_cdf(*shape, x);
359 if (std::isnan(cdf_x)) PG_RETURN_NULL();
360 if (i > 0) out << ',';
361 out << "{\"x\":" << x << ",\"p\":" << cdf_x << '}';
362 if (has_cont) {
363 const double pdf_x = shape_pdf(*shape, x);
364 if (std::isnan(pdf_x)) PG_RETURN_NULL();
365 if (i > 0) pdf_out << ',';
366 pdf_out << "{\"x\":" << x << ",\"p\":" << pdf_x << '}';
367 }
368 }
369 out << ']';
370 if (has_cont) {
371 pdf_out << ']';
372 out << ',' << pdf_out.str();
373 }
374 first_field = false;
375
376 if (!stems.empty()) {
377 if (!first_field) out << ',';
378 out << "\"stems\":[";
379 for (std::size_t i = 0; i < stems.size(); ++i) {
380 if (i > 0) out << ',';
381 out << "{\"x\":" << stems[i].first
382 << ",\"p\":" << stems[i].second << '}';
383 }
384 out << ']';
385 }
386 out << '}';
387
388 Datum json = DirectFunctionCall1(
389 jsonb_in, CStringGetDatum(pstrdup(out.str().c_str())));
390 PG_RETURN_DATUM(json);
391 } catch (const std::exception &e) {
392 provsql_error("rv_analytical_curves: %s", e.what());
393 } catch (...) {
394 provsql_error("rv_analytical_curves: unknown exception");
395 }
396 PG_RETURN_NULL();
397}
Closed-form CDF resolution for trivial gate_cmp shapes.
GenericCircuit getJointCircuit(pg_uuid_t root_token, pg_uuid_t event_token, gate_t &root_gate, gate_t &event_gate)
Build a GenericCircuit containing the closures of two roots, with shared subgraphs unified.
Build in-memory circuits from the mmap-backed persistent store.
gate_t
Strongly-typed gate identifier.
Definition Circuit.h:49
Semiring-agnostic in-memory provenance circuit.
Peephole simplifier for continuous gate_arith sub-circuits.
Continuous random-variable helpers (distribution parsing, moments).
Support-based bound check for continuous-RV comparators.
Datum rv_analytical_curves(PG_FUNCTION_ARGS)
Exception type thrown by circuit operations on invalid input.
Definition Circuit.h:206
In-memory provenance circuit with semiring-generic evaluation.
@ Normal
Normal (Gaussian): p1=μ, p2=σ
@ Exponential
Exponential: p1=λ, p2 unused.
@ Uniform
Uniform on [a,b]: p1=a, p2=b.
@ Erlang
Erlang: p1=k (positive integer), p2=λ.
std::optional< ClosedFormShape > matchClosedFormDistribution(const GenericCircuit &gc, gate_t root, std::optional< gate_t > event_root)
Detect any of the closed-form shapes supported by rv_analytical_curves.
unsigned runHybridSimplifier(GenericCircuit &gc)
Run the peephole simplifier over gc.
double pdfAt(const DistributionSpec &d, double c)
Closed-form probability density for a basic distribution.
std::variant< TruncatedSingleRv, DiracShape, CategoricalShape, BernoulliMixtureShape > ClosedFormShape
One of the closed-form shapes the analytical-curves payload can render: bare RV (continuous PDF/CDF),...
Definition RangeCheck.h:200
double cdfAt(const DistributionSpec &d, double c)
Closed-form CDF for a basic continuous distribution.
bool provsql_hybrid_evaluation
Run the hybrid-evaluator simplifier inside probability_evaluate; controlled by the provsql....
Definition provsql.c:84
Uniform error-reporting macros for ProvSQL.
#define provsql_error(fmt,...)
Report a fatal ProvSQL error and abort the current transaction.
Core types, constants, and utilities shared across ProvSQL.
C++ utility functions for UUID manipulation.
UUID structure.
Parsed distribution spec (kind + up to two parameters).
double p2
Second parameter (σ or b; unused for Exponential).
double p1
First parameter (μ, a, or λ).
Detection result for a closed-form, optionally-truncated single-RV shape.
Definition RangeCheck.h:101
double lo
Lower bound (-INF if unbounded).
Definition RangeCheck.h:103
DistributionSpec spec
Parsed kind + parameters.
Definition RangeCheck.h:102
double hi
Upper bound (+INF if unbounded).
Definition RangeCheck.h:104
bool truncated
True iff the bounds came from a non-trivial event_root.
Definition RangeCheck.h:105