ProvSQL C/C++ API
Adding support for provenance and uncertainty management to PostgreSQL databases
Loading...
Searching...
No Matches
StructuredDNNF.cpp
Go to the documentation of this file.
1/**
2 * @file StructuredDNNF.cpp
3 * @brief Implementation of the structured-d-DNNF builder (see StructuredDNNF.h).
4 *
5 * The §4 top-down construction: the monotone lineage is expanded to a canonical
6 * DNF and compiled into a ProvSQL @c dDNNF by decomposable AND at independence
7 * points and deterministic OR at Shannon decisions on the caller-supplied
8 * (Prop. 4.5) variable order, with a component cache sharing equal sub-d-DNNFs.
9 * Size is linear in the lineage under that order, where the generic methods
10 * (tree-decomposition, d4) blow up.
11 */
12#include "StructuredDNNF.h"
13
14#include <algorithm>
15#include <functional>
16#include <map>
17#include <set>
18#include <stdexcept>
19
20
21/* ======================================================================== *
22 * StructuredDNNFBuilder: the §4 top-down structured-d-DNNF construction *
23 * ======================================================================== */
24
25namespace {
26
27/* Hard caps so a pathological input degrades to "no decomposition" / a thrown
28 * size guard rather than running away. These bound work, not correctness. */
29constexpr std::size_t DNF_TERM_LIMIT = 8'000'000; // crossProduct blow-up
30constexpr std::size_t DECOMP_VAR_LIMIT = 1024; // skip pairwise above this
31/* Subsumption (O(T²)) and the decomposition cross-product check share this
32 * threshold: both run on residuals at or below it, neither above. Keeping them
33 * paired matters for soundness -- the cross-product verification assumes the
34 * term set is subsumption-free. Large residuals near the top are handled by
35 * Shannon decisions alone (correct, linear under the query order via caching). */
36constexpr std::size_t DECOMP_TERM_LIMIT = 512;
37
38inline bool contains(const std::vector<int> &term, int v)
39{
40 return std::binary_search(term.begin(), term.end(), v);
41}
42
43/* "unset" marker for the per-variable gate slots; distinct from any real gate
44 * id (which are dense from 0). */
45constexpr gate_t NO_GATE = gate_t{~std::size_t{0}};
46
47} // namespace
48
50{
51 return fs == o.fs && d == o.d;
52}
53
55{
56 /* order-sensitive fold; DNF keys are canonical so equal DNFs hash equal */
57 std::size_t h = 1469598103934665603ull; // FNV-ish basis
58 auto mix = [&](std::size_t x){ h ^= x + 0x9e3779b97f4a7c15ull + (h<<6) + (h>>2); };
59 for (const auto &t : k.d) {
60 for (int v : t) mix((std::size_t) (unsigned) v);
61 mix(0xffffffffull); // term separator
62 }
63 mix((std::size_t) k.fs);
64 return h;
65}
66
69{
70 /* sort + dedup variables within each term */
71 for (auto &t : d) {
72 std::sort(t.begin(), t.end());
73 t.erase(std::unique(t.begin(), t.end()), t.end());
74 }
75 /* an empty term is the constant TRUE: the whole DNF collapses */
76 for (const auto &t : d)
77 if (t.empty())
78 return DNF{ Term{} };
79
80 std::sort(d.begin(), d.end());
81 d.erase(std::unique(d.begin(), d.end()), d.end());
82
83 /* drop subsumed terms: if t' ⊇ t (some other term is a superset) t' is
84 * redundant in a disjunction. O(n²·k) in the term count, so only worth it on
85 * the small residuals where it sharpens decomposition; large residuals (near
86 * the top) are not decomposed anyway, and skipping keeps canonical O(n log n)
87 * there. Subsumption only ever removes terms, so skipping never changes the
88 * function. */
89 if (d.size() > DECOMP_TERM_LIMIT)
90 return d;
91 std::vector<char> dead(d.size(), 0);
92 for (std::size_t i = 0; i < d.size(); ++i) {
93 if (dead[i]) continue;
94 for (std::size_t j = 0; j < d.size(); ++j) {
95 if (i == j || dead[j] || d[i].size() > d[j].size())
96 continue;
97 bool subset = std::includes(d[j].begin(), d[j].end(),
98 d[i].begin(), d[i].end());
99 if (subset) /* d[i] ⊆ d[j]: d[j] is subsumed */
100 dead[j] = 1;
101 }
102 }
103 DNF out;
104 for (std::size_t i = 0; i < d.size(); ++i)
105 if (!dead[i])
106 out.push_back(std::move(d[i]));
107 return out;
108}
109
112{
113 DNF out;
114 out.reserve(d.size());
115 for (const auto &t : d) {
116 bool has = contains(t, v);
117 if (value) { /* v = true */
118 if (!has) { out.push_back(t); continue; }
119 Term nt; /* drop v; a now-empty term => TRUE */
120 nt.reserve(t.size() - 1);
121 for (int x : t) if (x != v) nt.push_back(x);
122 out.push_back(std::move(nt));
123 } else { /* v = false: terms needing v die */
124 if (!has) out.push_back(t);
125 }
126 }
127 return canonical(std::move(out));
128}
129
130std::vector<StructuredDNNFBuilder::DNF>
132{
133 /* variables present, ascending */
134 std::set<int> vset;
135 for (const auto &t : d) for (int v : t) vset.insert(v);
136 std::vector<int> vars(vset.begin(), vset.end());
137 const std::size_t V = vars.size(), T = d.size();
138 if (V <= 1 || V > DECOMP_VAR_LIMIT || T > DECOMP_TERM_LIMIT)
139 return { d };
140
141 std::map<int, int> idx; /* var -> 0..V-1 */
142 for (std::size_t i = 0; i < V; ++i) idx[vars[i]] = (int) i;
143
144 /* per-variable term count and pairwise co-occurrence count (n11) */
145 std::vector<int> cnt(V, 0);
146 std::map<std::pair<int,int>, int> co;
147 for (const auto &t : d) {
148 for (int v : t) cnt[(std::size_t) idx[v]]++;
149 for (std::size_t a = 0; a < t.size(); ++a)
150 for (std::size_t b = a + 1; b < t.size(); ++b) {
151 int u = idx[t[a]], w = idx[t[b]];
152 if (u > w) std::swap(u, w);
153 co[{u, w}]++;
154 }
155 }
156
157 /* union-find: variables u,w that are *not* independent must share a block.
158 * |proj_u| = 1 + (cnt[u] < T) (cnt[u] > 0 since u is present).
159 * Independent iff |proj_{u,w}| == |proj_u|·|proj_w|. */
160 std::vector<int> uf(V);
161 for (std::size_t i = 0; i < V; ++i) uf[i] = (int) i;
162 std::function<int(int)> find = [&](int x){ while (uf[x]!=x){ uf[x]=uf[uf[x]]; x=uf[x]; } return x; };
163 auto unite = [&](int a, int b){ a=find(a); b=find(b); if(a!=b) uf[a]=b; };
164
165 auto projcard = [&](int u){ return 1 + (cnt[(std::size_t)u] < (int) T ? 1 : 0); };
166 for (std::size_t a = 0; a < V; ++a)
167 for (std::size_t b = a + 1; b < V; ++b) {
168 int u = (int) a, w = (int) b;
169 int n11 = 0;
170 auto it = co.find({u, w});
171 if (it != co.end()) n11 = it->second;
172 int n10 = cnt[a] - n11, n01 = cnt[b] - n11;
173 int n00 = (int) T - cnt[a] - cnt[b] + n11;
174 int card = (n11>0) + (n10>0) + (n01>0) + (n00>0);
175 if (card != projcard(u) * projcard(w)) /* dependent */
176 unite(u, w);
177 }
178
179 /* gather blocks */
180 std::map<int, std::vector<int>> blocks; /* root -> vars (original ids) */
181 for (std::size_t i = 0; i < V; ++i)
182 blocks[find((int) i)].push_back(vars[i]);
183 if (blocks.size() <= 1)
184 return { d };
185
186 /* verify the exact cross-product: |T| == ∏_b |proj_b(d)|. Since every term
187 * projects into each block, T ⊆ ×proj always; equal cardinality ⟺ equality,
188 * so this makes the split sound (the AND node is genuinely decomposable). */
189 std::vector<std::set<int>> blockset; /* fast membership per block */
190 std::vector<std::vector<int>> blockvars;
191 for (auto &kv : blocks) {
192 blockset.emplace_back(kv.second.begin(), kv.second.end());
193 blockvars.push_back(kv.second);
194 }
195 std::vector<std::set<Term>> proj(blockset.size());
196 for (const auto &t : d)
197 for (std::size_t bi = 0; bi < blockset.size(); ++bi) {
198 Term p;
199 for (int v : t) if (blockset[bi].count(v)) p.push_back(v);
200 proj[bi].insert(std::move(p));
201 }
202 std::size_t prod = 1;
203 for (const auto &s : proj) prod *= s.size();
204 if (prod != T)
205 return { d }; /* not an exact product */
206
207 std::vector<DNF> parts;
208 for (auto &s : proj) {
209 DNF part(s.begin(), s.end());
210 parts.push_back(canonical(std::move(part)));
211 }
212 return parts;
213}
214
217 const std::map<gate_t, int> &rank,
218 std::map<gate_t, DNF> &memo) const
219{
220 auto mit = memo.find(g);
221 if (mit != memo.end()) return mit->second;
222
223 DNF res;
224 switch (bc.getGateType(g)) {
225 case BooleanGate::IN: {
226 auto rit = rank.find(g);
227 if (rit == rank.end())
228 throw CircuitException("StructuredDNNFBuilder: input gate has no rank "
229 "in the supplied variable order");
230 res = DNF{ Term{ rit->second } };
231 break;
232 }
233 case BooleanGate::AND: {
234 res = DNF{ Term{} }; /* neutral element: TRUE */
235 for (gate_t c : bc.getWires(g)) {
236 DNF cd = extract(bc, c, rank, memo);
237 DNF prod;
238 if (res.size() * cd.size() > DNF_TERM_LIMIT)
239 throw CircuitException("StructuredDNNFBuilder: DNF expansion exceeds "
240 "size guard");
241 prod.reserve(res.size() * cd.size());
242 for (const auto &ta : res)
243 for (const auto &tb : cd) {
244 Term t = ta;
245 t.insert(t.end(), tb.begin(), tb.end());
246 prod.push_back(std::move(t));
247 }
248 res = canonical(std::move(prod));
249 }
250 break;
251 }
252 case BooleanGate::OR: {
253 for (gate_t c : bc.getWires(g)) {
254 DNF cd = extract(bc, c, rank, memo);
255 for (auto &t : cd) res.push_back(std::move(t));
256 if (res.size() > DNF_TERM_LIMIT)
257 throw CircuitException("StructuredDNNFBuilder: DNF expansion exceeds "
258 "size guard");
259 }
260 res = canonical(std::move(res));
261 break;
262 }
263 default:
264 throw CircuitException("StructuredDNNFBuilder: unsupported gate type "
265 "(NOT, multivalued input or non-Boolean gate)");
266 }
267 memo.emplace(g, res);
268 return res;
269}
270
272{
273 gate_t g = dd_.setGate(type);
274 if (max_nodes_ && dd_.getNbGates() > max_nodes_)
275 throw CircuitException("StructuredDNNFBuilder: size guard exceeded");
276 return g;
277}
278
280{
281 if (in_gate_[(std::size_t) v] == NO_GATE)
282 in_gate_[(std::size_t) v] = uuid_[(std::size_t) v].empty()
283 ? dd_.setGate(BooleanGate::IN, prob_[(std::size_t) v])
284 : dd_.setGate(uuid_[(std::size_t) v], BooleanGate::IN, prob_[(std::size_t) v]);
285 return in_gate_[(std::size_t) v];
286}
287
289{
290 if (not_gate_[(std::size_t) v] == NO_GATE) {
292 dd_.addWire(n, mkLit(v));
293 not_gate_[(std::size_t) v] = n;
294 }
295 return not_gate_[(std::size_t) v];
296}
297
298gate_t StructuredDNNFBuilder::mkAnd(const std::vector<gate_t> &children)
299{
300 std::vector<gate_t> real;
301 for (gate_t c : children) {
302 if (c == false_gate_) return false_gate_; /* absorbing */
303 if (c == true_gate_) continue; /* neutral */
304 real.push_back(c);
305 }
306 if (real.empty()) return true_gate_;
307 if (real.size() == 1) return real[0];
309 for (gate_t c : real) dd_.addWire(g, c);
310 return g;
311}
312
313std::vector<StructuredDNNFBuilder::DNF>
315{
316 /* variables present, with a dense index for union-find */
317 std::set<int> vset;
318 for (const auto &t : d) for (int v : t) vset.insert(v);
319 if (vset.size() <= 1) return { d };
320 std::vector<int> vars(vset.begin(), vset.end());
321 std::map<int,int> idx;
322 for (std::size_t i = 0; i < vars.size(); ++i) idx[vars[i]] = (int) i;
323
324 std::vector<int> uf(vars.size());
325 for (std::size_t i = 0; i < uf.size(); ++i) uf[i] = (int) i;
326 std::function<int(int)> find = [&](int x){ while (uf[x]!=x){ uf[x]=uf[uf[x]]; x=uf[x]; } return x; };
327 /* variables co-occurring in a term are connected (the disjunct couples them) */
328 for (const auto &t : d)
329 for (std::size_t a = 1; a < t.size(); ++a) {
330 int r0 = find(idx[t[0]]), ra = find(idx[t[a]]);
331 if (r0 != ra) uf[r0] = ra;
332 }
333
334 /* group terms by their component (all of a term's vars share one root) */
335 std::map<int, DNF> comp;
336 for (const auto &t : d) comp[find(idx[t[0]])].push_back(t);
337 if (comp.size() <= 1) return { d };
338
339 /* order components by least variable, so the chain follows the order */
340 std::vector<std::pair<int,DNF>> ordered;
341 for (auto &kv : comp) {
342 int mn = INT_MAX;
343 for (const auto &t : kv.second) mn = std::min(mn, t.front());
344 ordered.push_back({ mn, std::move(kv.second) });
345 }
346 std::sort(ordered.begin(), ordered.end(),
347 [](const auto &a, const auto &b){ return a.first < b.first; });
348 std::vector<DNF> out;
349 out.reserve(ordered.size());
350 for (auto &pr : ordered) out.push_back(std::move(pr.second));
351 return out;
352}
353
355{
356 DNF d = canonical(draw);
357 if (d.empty()) return false_sink;
358 if (d.size() == 1 && d[0].empty()) return true_gate_;
359
360 CacheKey key{ d, false_sink };
361 auto cit = cache_.find(key);
362 if (cit != cache_.end()) return cit->second;
363
364 gate_t res;
365
366 /* (1) OR-chain over variable-disjoint components. d = c0 ∨ c1 ∨ … ∨ sink, the
367 * cᵢ ordered by least variable. Build right to left: each component's FALSE
368 * leaf is the node for "everything later", so an inert component is one shared
369 * node, never copied into the residual DNF. */
370 std::vector<DNF> comps = orDecompose(d);
371 if (comps.size() > 1) {
372 gate_t tail = false_sink;
373 for (auto it = comps.rbegin(); it != comps.rend(); ++it)
374 tail = build(*it, tail);
375 res = tail;
376 } else {
377 /* (2) decomposable AND: residual is a product of variable-disjoint factors.
378 * Only when there is no pending OR tail (false_sink is the global FALSE):
379 * with a tail, the function is (∏ factors) ∨ tail, which is not a clean
380 * decomposable AND. This also splits a single multi-variable term into a
381 * flat AND of literal leaves. */
382 std::vector<DNF> parts = (false_sink == false_gate_)
383 ? andDecompose(d) : std::vector<DNF>{ d };
384 if (parts.size() > 1) {
385 std::vector<gate_t> ch;
386 ch.reserve(parts.size());
387 for (const auto &p : parts) ch.push_back(build(p, false_gate_));
388 res = mkAnd(ch);
389 } else {
390 /* (3) deterministic OR: Shannon-decide the lowest-rank variable present. */
391 Var v = d[0][0];
392 for (const auto &t : d) if (t[0] < v) v = t[0];
393 gate_t ghi = build(condition(d, v, true), false_sink);
394 gate_t glo = build(condition(d, v, false), false_sink);
395 if (ghi == glo) {
396 res = ghi;
397 } else {
398 gate_t andHi = mkAnd({ mkLit(v), ghi });
399 gate_t andLo = mkAnd({ mkNeg(v), glo });
401 if (andHi != false_gate_) dd_.addWire(orG, andHi);
402 if (andLo != false_gate_) dd_.addWire(orG, andLo);
403 res = orG;
404 }
405 }
406 }
407
408 cache_.emplace(CacheKey{ std::move(d), false_sink }, res);
409 return res;
410}
411
413 gate_t root,
414 const std::map<gate_t, int> &input_rank,
415 std::size_t max_nodes)
416 : max_nodes_(max_nodes)
417{
418 if (bc.hasMultivaluedGates())
419 throw CircuitException("StructuredDNNFBuilder: multivalued inputs (BID) are "
420 "out of scope for the inversion-free path");
421
422 std::size_t ninputs = input_rank.size();
423 prob_.assign(ninputs, 0.0);
424 uuid_.assign(ninputs, std::string());
425 in_gate_.assign(ninputs, NO_GATE);
426 not_gate_.assign(ninputs, NO_GATE);
427 for (const auto &kv : input_rank) {
428 if (kv.second < 0 || (std::size_t) kv.second >= ninputs)
429 throw CircuitException("StructuredDNNFBuilder: input rank out of range");
430 prob_[(std::size_t) kv.second] = bc.getProb(kv.first);
431 // Carry the source UUID onto each rank so the built d-DNNF's input leaves
432 // are identifiable (DOT labels, NNF variable mapping), like every other
433 // d-DNNF builder; the structured recursion itself only uses the rank.
434 uuid_[(std::size_t) kv.second] = bc.getUUID(kv.first);
435 }
436
437 true_gate_ = dd_.setGate(BooleanGate::AND); /* empty AND = TRUE */
438 false_gate_ = dd_.setGate(BooleanGate::OR); /* empty OR = FALSE */
439
440 std::map<gate_t, DNF> memo;
441 DNF top = extract(bc, root, input_rank, memo);
442
443 root_ = build(top, false_gate_);
444 dd_.root = root_;
445 dd_.simplify();
446}
447
448
450{
451 std::set<gate_t> seen;
452 std::vector<gate_t> stack{ dd_.root };
453 while (!stack.empty()) {
454 gate_t g = stack.back(); stack.pop_back();
455 if (!seen.insert(g).second) continue;
456 for (gate_t c : dd_.getWires(g)) stack.push_back(c);
457 }
458 return seen.size();
459}
BooleanGate
Gate types for a Boolean provenance circuit.
@ NOT
Logical negation of a single child gate.
@ OR
Logical disjunction of child gates.
@ AND
Logical conjunction of child gates.
@ IN
Input (variable) gate representing a base tuple.
gate_t
Strongly-typed gate identifier.
Definition Circuit.h:49
In-process structured-d-DNNF construction over a query-derived variable order, for the inversion-free...
Boolean circuit for provenance formula evaluation.
bool hasMultivaluedGates() const
Return true if the circuit contains any MULIN gates.
double getProb(gate_t g) const
Return the probability stored for gate g.
Exception type thrown by circuit operations on invalid input.
Definition Circuit.h:206
std::vector< gate_t > & getWires(gate_t g)
Return a mutable reference to the child-wire list of gate g.
Definition Circuit.h:140
gateType getGateType(gate_t g) const
Return the type of gate g.
Definition Circuit.h:130
uuid getUUID(gate_t g) const
Return the UUID string associated with gate g.
Definition Circuit.hpp:46
gate_t mkNeg(Var v)
shared negative literal NOT(IN)
std::vector< DNF > andDecompose(const DNF &d) const
static DNF canonical(DNF d)
sort, dedup, drop supersets
StructuredDNNFBuilder(const BooleanCircuit &bc, gate_t root, const std::map< gate_t, int > &input_rank, std::size_t max_nodes=0)
DNF extract(const BooleanCircuit &bc, gate_t g, const std::map< gate_t, int > &rank, std::map< gate_t, DNF > &memo) const
std::size_t size() const
d-DNNF gates reachable from the root.
std::vector< std::string > uuid_
rank -> source input UUID (for labels)
std::vector< gate_t > in_gate_
rank -> shared dDNNF IN gate
std::unordered_map< CacheKey, gate_t, CacheKeyHash > cache_
std::vector< gate_t > not_gate_
rank -> shared dDNNF NOT(IN) gate (lazy)
gate_t mkAnd(const std::vector< gate_t > &children)
static DNF condition(const DNF &d, Var v, bool value)
std::vector< double > prob_
prob_[rank] = P(variable)
std::vector< DNF > orDecompose(const DNF &d) const
gate_t false_gate_
empty AND / empty OR terminals
gate_t mkLit(Var v)
shared positive literal IN
gate_t build(const DNF &d, gate_t false_sink)
std::vector< Term > DNF
A sum of products: canonicalised.
std::vector< Var > Term
A product: sorted, duplicate-free vars.
gate_t newGate(BooleanGate type)
setGate + size guard
int Var
Variable rank in [0,ninputs).
std::size_t operator()(const CacheKey &k) const
bool operator==(const CacheKey &o) const