doxygen-c/html/AnalyticEvaluator_8cpp_source.html

/**

 * @file AnalyticEvaluator.cpp

 * @brief Implementation of the closed-form CDF resolution pass.

 *        See @c AnalyticEvaluator.h for the full docstring.

 */

#include "AnalyticEvaluator.h"


#include <cmath>

#include <limits>

#include <optional>

#include <vector>


#include "Aggregation.h"        // ComparisonOperator + cmpOpFromOid

#include "RandomVariable.h"     // parse_distribution_spec, parseDoubleStrict, DistKind

extern "C" {

#include "provsql_utils.h"      // gate_type

}


namespace provsql {


double pdfAt(const DistributionSpec &d, double c)

{

  double pdf_c = std::numeric_limits<double>::quiet_NaN();

  switch (d.kind) {

    case DistKind::Normal: {

      /* f(c) = (1 / (σ √(2π))) · exp(-(c-μ)² / (2σ²)).  Numerically

       * stable for any finite c; for c far in the tail the exp

       * underflows to 0 cleanly. */

      const double mu = d.p1, sigma = d.p2;

      if (!(sigma > 0.0)) break;

      static const double SQRT_2PI = std::sqrt(2.0 * M_PI);

      const double z = (c - mu) / sigma;

      pdf_c = std::exp(-0.5 * z * z) / (sigma * SQRT_2PI);

      break;

    }

    case DistKind::Uniform: {

      const double a = d.p1, b = d.p2;

      if (!(b > a)) break;

      pdf_c = (c < a || c > b) ? 0.0 : 1.0 / (b - a);

      break;

    }

    case DistKind::Exponential: {

      const double lambda = d.p1;

      if (!(lambda > 0.0)) break;

      pdf_c = (c < 0.0) ? 0.0 : lambda * std::exp(-lambda * c);

      break;

    }

    case DistKind::Erlang: {

      /* f(c; k, λ) = λ^k · c^(k-1) · e^(-λc) / (k-1)! for c >= 0.

       * Same integer-k caveat as the CDF: non-integer shapes need

       * the regularised lower incomplete gamma and are out of scope. */

      const double s = d.p1, lambda = d.p2;

      if (s < 1.0 || s != std::floor(s) || !(lambda > 0.0)) break;

      if (c < 0.0) { pdf_c = 0.0; break; }

      const unsigned long k = static_cast<unsigned long>(s);

      /* (k-1)! is small for the typical Erlang shapes (k <= ~20);

       * compute incrementally to keep precision. */

      double fact = 1.0;

      for (unsigned long i = 2; i < k; ++i) fact *= static_cast<double>(i);

      pdf_c = std::pow(lambda, static_cast<double>(k))

            * std::pow(c, static_cast<double>(k - 1))

            * std::exp(-lambda * c)

            / fact;

      break;

    }

  }

  return pdf_c;

}


double cdfAt(const DistributionSpec &d, double c)

{

  double cdf_c = std::numeric_limits<double>::quiet_NaN();

  switch (d.kind) {

    case DistKind::Normal: {

      /* Φ((c - μ)/σ) = ½ (1 + erf((c - μ) / (σ √2))).  Standard

       * formula; std::erf is C99 / C++11. */

      static const double SQRT2 = std::sqrt(2.0);

      double z = (c - d.p1) / d.p2;

      cdf_c = 0.5 * (1.0 + std::erf(z / SQRT2));

      break;

    }

    case DistKind::Uniform:

      if (c <= d.p1)        cdf_c = 0.0;

      else if (c >= d.p2)   cdf_c = 1.0;

      else                  cdf_c = (c - d.p1) / (d.p2 - d.p1);

      break;

    case DistKind::Exponential:

      if (c <= 0.0) cdf_c = 0.0;

      else          cdf_c = -std::expm1(-d.p1 * c);  /* 1 - exp(-λc) */

      break;

    case DistKind::Erlang: {

      /* For integer shape s ≥ 1, the Erlang CDF has the finite-sum

       * form F(c; s, λ) = 1 - e^{-λc} Σ_{n=0..s-1} (λc)^n / n!.

       * Numerically stable for the small-to-moderate s the simplifier

       * produces (a SUM of k i.i.d. Exp gates).  Non-integer s would

       * require the regularised lower incomplete gamma function, so

       * we skip those cases by leaving cdf_c = NaN (caller treats NaN

       * as "undecided" and the cmp falls through to MC). */

      const double s = d.p1, lambda = d.p2;

      if (s < 1.0 || s != std::floor(s)) break;

      if (c <= 0.0) { cdf_c = 0.0; break; }

      const double lc = lambda * c;

      double term = 1.0;   /* (λc)^0 / 0! */

      double sum  = 1.0;

      const unsigned long k = static_cast<unsigned long>(s);

      for (unsigned long n = 1; n < k; ++n) {

        term *= lc / static_cast<double>(n);

        sum  += term;

      }

      cdf_c = 1.0 - std::exp(-lc) * sum;

      break;

    }

  }

  return cdf_c;

}


namespace {


/* Probability that a continuous RV with spec @p d satisfies
 * @c X @c op @c c, derived from the CDF F(c):
 *   - @c < and @c <=  ->  F(c)
 *   - @c > and @c >=  ->  1 - F(c)
 * (strict and non-strict coincide for continuous distributions).
 *
 * EQ / NE are expected to be settled upstream by RangeCheck
 * (P(X = c) = 0, P(X != c) = 1).  If one still arrives here, RangeCheck
 * did not run (e.g. provsql.simplify_on_load is off) and we answer NaN,
 * i.e. "undecided", rather than silently pick an inconsistent value.
 * A NaN CDF is likewise passed straight back to the caller. */
double cdfDecide(const DistributionSpec &d, ComparisonOperator op, double c)
{
  const double F = cdfAt(d, c);
  if (std::isnan(F))
    return F;

  const bool lowerTail = (op == ComparisonOperator::LT ||
                          op == ComparisonOperator::LE);
  if (lowerTail)
    return F;

  const bool upperTail = (op == ComparisonOperator::GT ||
                          op == ComparisonOperator::GE);
  if (upperTail)
    return 1.0 - F;

  /* EQ, NE, or an unexpected operator value: undecided. */
  return std::numeric_limits<double>::quiet_NaN();
}


/* Swap the sides of a comparison: rewriting @c c @c cmp @c X as
 * @c X @c flip(cmp) @c c.  The ordered operators exchange direction
 * (< with >, <= with >=); EQ, NE and any other value are returned
 * unchanged.  Mirrors @c provsql_having_detail::flip_op without pulling
 * the @c having_semantics dependency into this file. */
ComparisonOperator flipCmpOp(ComparisonOperator op)
{
  if (op == ComparisonOperator::LT) return ComparisonOperator::GT;
  if (op == ComparisonOperator::GT) return ComparisonOperator::LT;
  if (op == ComparisonOperator::LE) return ComparisonOperator::GE;
  if (op == ComparisonOperator::GE) return ComparisonOperator::LE;
  return op;   /* symmetric operators stay as they are */
}


/* X cmp Y for X, Y independent normals.  Reduces to (X - Y) cmp 0

 * with X - Y ~ N(μ_X - μ_Y, σ_X² + σ_Y²). */

double normalDiffDecide(const DistributionSpec &X,

                        const DistributionSpec &Y,

                        ComparisonOperator op)

{

  DistributionSpec diff;

  diff.kind = DistKind::Normal;

  diff.p1 = X.p1 - Y.p1;

  diff.p2 = std::sqrt(X.p2 * X.p2 + Y.p2 * Y.p2);

  return cdfDecide(diff, op, 0.0);

}


/* Read gate @p g as a numeric constant: if it is a @c gate_value whose
 * extra parses strictly as a double, return that number.  Any other
 * gate type, or a @c CircuitException from the parser, yields NaN,
 * which callers interpret as "skip this cmp". */
double bareValue(const GenericCircuit &gc, gate_t g)
{
  if (gc.getGateType(g) == gate_value) {
    try {
      return parseDoubleStrict(gc.getExtra(g));
    } catch (const CircuitException &) {
      /* unparsable constant: fall through to the NaN result */
    }
  }
  return std::numeric_limits<double>::quiet_NaN();
}


/* Read gate @p g as a random-variable leaf: for a @c gate_rv, hand its
 * extra to @c parse_distribution_spec and return the outcome (which is
 * itself empty when parsing fails).  Every other gate type gives
 * @c std::nullopt. */
std::optional<DistributionSpec>
bareRv(const GenericCircuit &gc, gate_t g)
{
  if (gc.getGateType(g) == gate_rv)
    return parse_distribution_spec(gc.getExtra(g));
  return std::nullopt;
}


/* Closed-form P(X cmp c) for a categorical-form gate_mixture X.  X's

 * wires are [key, mul_1, ..., mul_n]; each mul_i carries its

 * probability in set_prob and its outcome value in extra (parsed as

 * float8).  The probability is just the sum of π_i over mulinputs

 * whose value satisfies the predicate.

 *

 * EQ / NE are exact too in this setting (X = c iff some outcome equals

 * c with positive mass): the RangeCheck pre-pass treats EQ / NE over

 * continuous RVs as P=0 / P=1, but a categorical is discrete so we

 * decide them here.  Returns NaN if any mulinput's extra fails to

 * parse as a finite float8 -- the cmp then falls through to MC. */

double categoricalDecide(const GenericCircuit &gc, gate_t mix,

                         ComparisonOperator op, double c)

{

  const auto &wires = gc.getWires(mix);

  double p = 0.0;

  for (std::size_t i = 1; i < wires.size(); ++i) {

    double v;

    try { v = parseDoubleStrict(gc.getExtra(wires[i])); }

    catch (const CircuitException &) {

      return std::numeric_limits<double>::quiet_NaN();

    }

    bool hit = false;

    switch (op) {

      case ComparisonOperator::LT: hit = v <  c; break;

      case ComparisonOperator::LE: hit = v <= c; break;

      case ComparisonOperator::GT: hit = v >  c; break;

      case ComparisonOperator::GE: hit = v >= c; break;

      case ComparisonOperator::EQ: hit = v == c; break;

      case ComparisonOperator::NE: hit = v != c; break;

    }

    if (hit) p += gc.getProb(wires[i]);

  }

  return p;

}


/**

 * @brief Try to decide @p cmp_gate via a closed-form CDF.

 *

 * Recognised shapes:

 * - @c X @c cmp @c c (X a bare @c gate_rv, c a bare @c gate_value)

 * - @c c @c cmp @c X (mirror of the above; flip the comparator)

 * - @c X @c cmp @c Y where both @c X and @c Y are bare normal

 *   @c gate_rv leaves with distinct UUIDs (independence test)

 *

 * Returns the analytical probability in [0, 1] when decided,

 * @c NaN otherwise.

 */

double tryAnalyticDecide(const GenericCircuit &gc, gate_t cmp_gate)

{

  bool ok = false;

  ComparisonOperator op = cmpOpFromOid(gc.getInfos(cmp_gate).first, ok);

  if (!ok) return std::numeric_limits<double>::quiet_NaN();


  const auto &wires = gc.getWires(cmp_gate);

  if (wires.size() != 2) return std::numeric_limits<double>::quiet_NaN();

  gate_t lhs = wires[0], rhs = wires[1];


  /* X cmp c */

  if (auto specX = bareRv(gc, lhs)) {

    double c = bareValue(gc, rhs);

    if (!std::isnan(c)) return cdfDecide(*specX, op, c);

  }


  /* c cmp X */

  if (auto specX = bareRv(gc, rhs)) {

    double c = bareValue(gc, lhs);

    if (!std::isnan(c)) return cdfDecide(*specX, flipCmpOp(op), c);

  }


  /* Categorical mixture cmp constant: exact sum of mass over the

   * mulinputs whose value satisfies the predicate.  EQ / NE are

   * meaningful on a discrete distribution and decided here rather

   * than the continuous-default route RangeCheck takes. */

  if (gc.isCategoricalMixture(lhs)) {

    double c = bareValue(gc, rhs);

    if (!std::isnan(c)) return categoricalDecide(gc, lhs, op, c);

  }

  if (gc.isCategoricalMixture(rhs)) {

    double c = bareValue(gc, lhs);

    if (!std::isnan(c)) return categoricalDecide(gc, rhs, flipCmpOp(op), c);

  }


  /* X cmp Y, both bare normal RVs.  The @c X cmp X same-UUID case

   * is handled upstream by RangeCheck's identity shortcut, so by the

   * time we get here distinct UUIDs implies independence (each

   * @c provsql.normal call mints a fresh @c uuid_generate_v4 token). */

  {

    auto specX = bareRv(gc, lhs);

    auto specY = bareRv(gc, rhs);

    if (specX && specY &&

        specX->kind == DistKind::Normal &&

        specY->kind == DistKind::Normal) {

      return normalDiffDecide(*specX, *specY, op);

    }

  }


  return std::numeric_limits<double>::quiet_NaN();

}


}  // namespace


/**
 * @brief Run the closed-form resolution pass over every @c gate_cmp of @p gc.
 *
 * Each cmp gate that @c tryAnalyticDecide can decide is replaced in
 * place through @c GenericCircuit::resolveCmpToBernoulli with its
 * probability; undecided gates (NaN) are left untouched.
 *
 * @param gc  Circuit to rewrite.
 * @return    Number of cmp gates that were resolved.
 */
unsigned runAnalyticEvaluator(GenericCircuit &gc)
{
  /* Collect the cmp-gate ids up front so that the in-place rewrites
   * below cannot disturb the iteration (same pattern as
   * @c runRangeCheck). */
  std::vector<gate_t> pending;
  const auto gateCount = gc.getNbGates();
  for (std::size_t idx = 0; idx < gateCount; ++idx) {
    const auto g = static_cast<gate_t>(idx);
    if (gc.getGateType(g) == gate_cmp)
      pending.push_back(g);
  }

  unsigned resolved = 0;
  for (const gate_t g : pending) {
    /* Skip gates that are no longer cmp gates by the time we reach them. */
    if (gc.getGateType(g) != gate_cmp)
      continue;

    double p = tryAnalyticDecide(gc, g);
    if (std::isnan(p))
      continue;   /* undecided: leave the gate as it is */

    /* Defensive clamp to [0, 1]: floating-point roundoff in the CDF
     * (e.g. 1 - F(c) for c far in the right tail) could land marginally
     * outside the unit interval. */
    if (p < 0.0)
      p = 0.0;
    else if (p > 1.0)
      p = 1.0;

    gc.resolveCmpToBernoulli(g, p);
    ++resolved;
  }

  return resolved;
}


}  // namespace provsql


cmpOpFromOid
ComparisonOperator cmpOpFromOid(Oid op_oid, bool &ok)
Map a PostgreSQL comparison-operator OID to a ComparisonOperator.
Definition Aggregation.cpp:66

Aggregation.h
Typed aggregation value, operator, and aggregator abstractions.

ComparisonOperator
ComparisonOperator
SQL comparison operators used in gate_cmp circuit gates.
Definition Aggregation.h:38

ComparisonOperator::EQ
@ EQ
Equal (=).
Definition Aggregation.h:39

ComparisonOperator::LT
@ LT
Less than (<).
Definition Aggregation.h:42

ComparisonOperator::GT
@ GT
Greater than (>).
Definition Aggregation.h:44

ComparisonOperator::LE
@ LE
Less than or equal (<=).
Definition Aggregation.h:41

ComparisonOperator::NE
@ NE
Not equal (<>).
Definition Aggregation.h:40

ComparisonOperator::GE
@ GE
Greater than or equal (>=).
Definition Aggregation.h:43

AnalyticEvaluator.h
Closed-form CDF resolution for trivial gate_cmp shapes.

gate_t
gate_t
Strongly-typed gate identifier.
Definition Circuit.h:49

RandomVariable.h
Continuous random-variable helpers (distribution parsing, moments).

Circuit::getWires
std::vector< gate_t > & getWires(gate_t g)
Return a mutable reference to the child-wire list of gate g.
Definition Circuit.h:140

Circuit::getGateType
gateType getGateType(gate_t g) const
Return the type of gate g.
Definition Circuit.h:130

Circuit::getNbGates
std::vector< gate_t >::size_type getNbGates() const
Return the total number of gates in the circuit.
Definition Circuit.h:103

GenericCircuit
In-memory provenance circuit with semiring-generic evaluation.
Definition GenericCircuit.h:49

GenericCircuit::isCategoricalMixture
bool isCategoricalMixture(gate_t g) const
Test whether g is a categorical-form gate_mixture (the explicit provsql.categorical output).
Definition GenericCircuit.h:516

GenericCircuit::getExtra
std::string getExtra(gate_t g) const
Return the string extra for gate g.
Definition GenericCircuit.h:106

GenericCircuit::getProb
double getProb(gate_t g) const
Return the probability for gate g.
Definition GenericCircuit.h:144

GenericCircuit::resolveCmpToBernoulli
void resolveCmpToBernoulli(gate_t g, double p)
Replace a gate_cmp by a constant Boolean leaf (gate_one for p == 1, gate_zero for p == 0) or by a Ber...
Definition GenericCircuit.h:176

GenericCircuit::getInfos
std::pair< unsigned, unsigned > getInfos(gate_t g) const
Return the integer annotation pair for gate g.
Definition GenericCircuit.h:83

provsql
Definition AnalyticEvaluator.cpp:19

provsql::DistKind::Normal
@ Normal
Normal (Gaussian): p1=μ, p2=σ
Definition RandomVariable.h:29

provsql::DistKind::Exponential
@ Exponential
Exponential: p1=λ, p2 unused.
Definition RandomVariable.h:31

provsql::DistKind::Uniform
@ Uniform
Uniform on [a,b]: p1=a, p2=b.
Definition RandomVariable.h:30

provsql::DistKind::Erlang
@ Erlang
Erlang: p1=k (positive integer), p2=λ.
Definition RandomVariable.h:32

provsql::parseDoubleStrict
double parseDoubleStrict(const std::string &s)
Strictly parse s as a double.
Definition RandomVariable.cpp:17

provsql::pdfAt
double pdfAt(const DistributionSpec &d, double c)
Closed-form probability density f(c) for a basic distribution.
Definition AnalyticEvaluator.cpp:21

provsql::parse_distribution_spec
std::optional< DistributionSpec > parse_distribution_spec(const std::string &s)
Parse the on-disk text encoding of a gate_rv distribution.
Definition RandomVariable.cpp:59

provsql::cdfAt
double cdfAt(const DistributionSpec &d, double c)
Closed-form CDF F(c) = P(X <= c) for a basic continuous distribution.
Definition AnalyticEvaluator.cpp:70

provsql::runAnalyticEvaluator
unsigned runAnalyticEvaluator(GenericCircuit &gc)
Run the closed-form CDF resolution pass over gc.
Definition AnalyticEvaluator.cpp:300

gate_value
@ gate_value
Definition provsql_migrate_mmap.cpp:74

gate_cmp
@ gate_cmp
Definition provsql_migrate_mmap.cpp:74

provsql_utils.h
Core types, constants, and utilities shared across ProvSQL.

gate_rv
@ gate_rv
Continuous random-variable leaf (extra encodes distribution).
Definition provsql_utils.h:71

provsql::DistributionSpec
Parsed distribution spec (kind + up to two parameters).
Definition RandomVariable.h:46

provsql::DistributionSpec::kind
DistKind kind
Definition RandomVariable.h:47

provsql::DistributionSpec::p2
double p2
Second parameter (σ or b; unused for Exponential).
Definition RandomVariable.h:49

provsql::DistributionSpec::p1
double p1
First parameter (μ, a, or λ).
Definition RandomVariable.h:48