ProvSQL C/C++ API
Adding support for provenance and uncertainty management to PostgreSQL databases
Loading...
Searching...
No Matches
RvHistogram.cpp
Go to the documentation of this file.
1/**
2 * @file RvHistogram.cpp
3 * @brief SQL function `provsql.rv_histogram(token, bins)`.
4 *
5 * Returns an empirical histogram of the scalar values produced by the
6 * sub-circuit rooted at @c token, binned into @p bins equal-width
7 * intervals between the observed minimum and maximum. The sample
8 * count is taken from @c provsql.rv_mc_samples; pinning
9 * @c provsql.monte_carlo_seed makes the result reproducible.
10 *
11 * Output: a jsonb array of objects @c {bin_lo, bin_hi, count}, one per
12 * non-empty bin (zero-count bins are still emitted so the client can
13 * draw a flat baseline). Returning jsonb rather than @c SETOF record
14 * keeps the C++ side free of SRF / FuncCallContext mechanics and
15 * matches the pattern used by @c simplified_circuit_subgraph.
16 *
17 * Accepted root gate types:
18 * - @c gate_value: degenerate Dirac at the constant. Emits a single
19 * bin at @c (v, v) with count equal to the would-be sample count,
20 * so the client can normalise to a probability mass without a
21 * special case.
22 * - @c gate_rv: sampled from the leaf's distribution.
23 * - @c gate_arith: sampled by recursing through the arithmetic DAG,
24 * reusing @c gate_rv draws within an iteration so shared leaves
25 * are correctly correlated.
26 * - @c gate_mixture: sampled by recursing through the mixture's
27 * Bernoulli (gate_input) wire and the selected scalar branch,
28 * reusing per-iteration caches so a shared p_token across the
29 * circuit produces coupled draws.
30 *
31 * Any other gate type raises: probability of a Boolean-valued gate is
32 * a scalar that the existing @c probability_evaluate dispatch covers,
33 * and aggregation roots have their own moment family in
34 * @c agg_raw_moment.
35 */
36extern "C" {
37#include "postgres.h"
38#include "fmgr.h"
39#include "utils/jsonb.h"
40#include "utils/fmgrprotos.h"
41#include "utils/uuid.h"
42#include "provsql_utils.h"
43#include "provsql_error.h"
44
45PG_FUNCTION_INFO_V1(rv_histogram);
46}
47
48#include "CircuitFromMMap.h"
49#include "GenericCircuit.h"
50#include "MonteCarloSampler.h"
51#include "RandomVariable.h"
52#include "RangeCheck.h"
53#include "provsql_utils_cpp.h"
54
55#include <algorithm>
56#include <cmath>
57#include <iomanip>
58#include <optional>
59#include <sstream>
60#include <string>
61#include <utility>
62#include <vector>
63
64namespace {
65
66void emit_bin(std::ostringstream &out, bool &first,
67 double lo, double hi, unsigned count)
68{
69 if (!first) out << ',';
70 first = false;
71 out << "{\"bin_lo\":" << lo
72 << ",\"bin_hi\":" << hi
73 << ",\"count\":" << count << '}';
74}
75
76} // namespace
77
78extern "C" Datum
79rv_histogram(PG_FUNCTION_ARGS)
80{
81 pg_uuid_t *root_arg = (pg_uuid_t *) PG_GETARG_POINTER(0);
82 int bins = PG_GETARG_INT32(1);
83 pg_uuid_t *prov_arg = (pg_uuid_t *) PG_GETARG_POINTER(2);
84
85 if (bins <= 0)
86 provsql_error("rv_histogram: bins must be positive (got %d)", bins);
87
88 std::ostringstream out;
89 /* setprecision(17) preserves full double round-trip through the
90 * jsonb_in parser, so the client sees the same bin edges that the
91 * sampler produced. */
92 out << std::setprecision(17);
93 out << '[';
94 bool first = true;
95
96 try {
97 /* Always go through getJointCircuit: when prov is gate_one() the
98 * joint loader still produces a valid single-root closure (the
99 * gate_one leaf is just an extra disconnected node). This keeps
100 * a single code path for shared-leaf coupling between the
101 * indicator (event_gate) and the value (root_gate) in the
102 * conditional case. */
103 gate_t root_gate, event_gate;
105 try {
106 gc = getJointCircuit(*root_arg, *prov_arg, root_gate, event_gate);
107 } catch (const CircuitException &) {
108 out << ']';
109 Datum json = DirectFunctionCall1(
110 jsonb_in, CStringGetDatum(pstrdup(out.str().c_str())));
111 PG_RETURN_DATUM(json);
112 }
113
114 /* gate_one event = unconditional. */
115 const bool conditional = gc.getGateType(event_gate) != gate_one;
116 std::optional<gate_t> event_opt;
117 if (conditional) event_opt = event_gate;
118
119 const gate_type t = gc.getGateType(root_gate);
120
121 if (t == gate_value) {
122 /* Dirac: parse the literal, emit one degenerate bin. Count
123 * mirrors rv_mc_samples (or 1 if MC is disabled) so the client
124 * can compute relative mass the same way for every gate kind. */
125 const double v = provsql::parseDoubleStrict(gc.getExtra(root_gate));
126 const unsigned n = provsql_rv_mc_samples > 0
127 ? static_cast<unsigned>(provsql_rv_mc_samples)
128 : 1u;
129 emit_bin(out, first, v, v, n);
130 } else if (t == gate_rv || t == gate_arith || t == gate_mixture) {
131 if (provsql_rv_mc_samples <= 0)
133 "rv_histogram: provsql.rv_mc_samples = 0 disables sampling; "
134 "raise it above 0 to compute a histogram");
135 const unsigned N = static_cast<unsigned>(provsql_rv_mc_samples);
136
137 std::vector<double> samples;
138 if (conditional) {
139 /* Closed-form truncation fast path: when the root is a bare
140 * gate_rv of a supported family and the event reduces to an
141 * interval on it, draw exactly @c N samples from the truncated
142 * distribution. 100% acceptance, so the "accepted 0 of N"
143 * error below no longer fires on tight events that previously
144 * starved the MC rejection path. */
146 gc, root_gate, event_gate, N);
147 if (direct) {
148 samples = std::move(*direct);
149 } else {
151 gc, root_gate, event_gate, N);
152 if (cs.accepted.empty())
154 "rv_histogram: conditional MC accepted 0 of %u samples; "
155 "raise provsql.rv_mc_samples or check that the event is "
156 "satisfiable",
157 cs.attempted);
158 samples = std::move(cs.accepted);
159 }
160 } else {
161 samples = provsql::monteCarloScalarSamples(gc, root_gate, N);
162 }
163
164 if (!samples.empty()) {
165 /* Pick the bin range per side: when @c compute_support proves
166 * a finite support endpoint we use it verbatim (Uniform / sums
167 * of Uniforms / mixtures of bounded RVs / Exponential's lower
168 * end at 0 / etc.), because the analytical bound is tighter
169 * than any sample-based estimate. When the support is open
170 * on a side (Normal, sums involving Normal, the upper tail of
171 * Exponential / Erlang) the raw empirical extreme would be
172 * an outlier draw -- ~±4σ for a Normal at rv_mc_samples = 10000
173 * -- which stretches the histogram so the bulk of the mass
174 * concentrates in middle bins and the edge bins look empty.
175 * Trim the outermost 0.1% of samples on that side; samples
176 * outside the trimmed range still get pooled into the edge
177 * bin (the clamp below), so total counts stay conserved. */
178 std::sort(samples.begin(), samples.end());
179 const std::size_t n = samples.size();
180 const std::size_t lo_idx = n / 1000; /* 0.1% */
181 const std::size_t hi_idx = n - 1 - lo_idx;
182 auto support = provsql::compute_support(gc, root_gate, event_opt);
183 double smin = std::isfinite(support.first)
184 ? support.first
185 : samples[lo_idx];
186 double smax = std::isfinite(support.second)
187 ? support.second
188 : samples[hi_idx];
189 if (smin == smax) {
190 /* Degenerate: trimmed range collapsed to a point (every
191 * non-tail draw is the same value, or the simplifier
192 * elided everything but a single point-mass leaf). */
193 emit_bin(out, first, smin, smax,
194 static_cast<unsigned>(samples.size()));
195 } else {
196 std::vector<unsigned> counts(bins, 0);
197 const double width = (smax - smin) / bins;
198 for (double x : samples) {
199 int b = static_cast<int>((x - smin) / width);
200 if (b < 0) b = 0;
201 if (b >= bins) b = bins - 1;
202 counts[b] += 1;
203 }
204 for (int i = 0; i < bins; ++i) {
205 const double lo = smin + i * width;
206 const double hi = (i == bins - 1) ? smax : lo + width;
207 emit_bin(out, first, lo, hi, counts[i]);
208 }
209 }
210 }
211 } else {
212 const char *type_name = (t < nb_gate_types)
213 ? gate_type_name[t] : "invalid";
215 "rv_histogram: root gate type '%s' is not a scalar "
216 "(expected gate_value, gate_rv, gate_arith, or gate_mixture)",
217 type_name);
218 }
219 } catch (const std::exception &e) {
220 provsql_error("rv_histogram: %s", e.what());
221 } catch (...) {
222 provsql_error("rv_histogram: unknown exception");
223 }
224
225 out << ']';
226 Datum json = DirectFunctionCall1(
227 jsonb_in, CStringGetDatum(pstrdup(out.str().c_str())));
228 PG_RETURN_DATUM(json);
229}
GenericCircuit getJointCircuit(pg_uuid_t root_token, pg_uuid_t event_token, gate_t &root_gate, gate_t &event_gate)
Build a GenericCircuit containing the closures of two roots, with shared subgraphs unified.
Build in-memory circuits from the mmap-backed persistent store.
gate_t
Strongly-typed gate identifier.
Definition Circuit.h:49
Semiring-agnostic in-memory provenance circuit.
Monte Carlo sampling over a GenericCircuit, RV-aware.
Continuous random-variable helpers (distribution parsing, moments).
Support-based bound check for continuous-RV comparators.
Datum rv_histogram(PG_FUNCTION_ARGS)
Exception type thrown by circuit operations on invalid input.
Definition Circuit.h:206
gateType getGateType(gate_t g) const
Return the type of gate g.
Definition Circuit.h:130
In-memory provenance circuit with semiring-generic evaluation.
std::string getExtra(gate_t g) const
Return the string extra for gate g.
std::pair< double, double > compute_support(const GenericCircuit &gc, gate_t root, std::optional< gate_t > event_root)
Compute the [lo, hi] support interval of a scalar sub-circuit rooted at root.
double parseDoubleStrict(const std::string &s)
Strictly parse s as a double.
ConditionalScalarSamples monteCarloConditionalScalarSamples(const GenericCircuit &gc, gate_t root, gate_t event_root, unsigned samples)
Rejection-sample root conditioned on event_root.
std::vector< double > monteCarloScalarSamples(const GenericCircuit &gc, gate_t root, unsigned samples)
Sample a scalar sub-circuit samples times and return the draws.
std::optional< std::vector< double > > try_truncated_closed_form_sample(const GenericCircuit &gc, gate_t root, gate_t event_root, unsigned n)
Try to draw n exact samples from the conditional distribution of root given event_root via closed-for...
int provsql_rv_mc_samples
Default sample count for analytical-evaluator MC fallbacks; 0 disables fallback (callers raise instea...
Definition provsql.c:82
Uniform error-reporting macros for ProvSQL.
#define provsql_error(fmt,...)
Report a fatal ProvSQL error and abort the current transaction.
const char * gate_type_name[]
Names of gate types.
Core types, constants, and utilities shared across ProvSQL.
@ gate_rv
Continuous random-variable leaf (extra encodes distribution).
@ gate_mixture
Probabilistic mixture: three wires [p_token (gate_input Bernoulli), x_token, y_token]; samples x when...
@ gate_arith
n-ary arithmetic gate over scalar-valued children (info1 holds operator tag)
C++ utility functions for UUID manipulation.
UUID structure.