\documentclass[bigger]{beamer}
\useinnertheme{circles}
\usefonttheme[onlymath]{serif}
\usefonttheme{structurebold}
\setbeamertemplate{navigation symbols}{}
\usepackage{xcolor}
\setbeamercolor{highlight block}{bg=gray}
\usepackage{tikz}
\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc}
\usepackage[normalem]{ulem} % To strikeout
\usepackage{commath}
\newcommand{\naf}{\ensuremath{\sim\!\!}}
\title{Probabilistic Answer Set Programming}
\subtitle{A Research Draft}
\author{Francisco Coelho}
\institute[\texttt{fc@uevora.pt}]{
NOVA LINCS \&\\
High Performance Computing Chair \&\\
Departamento de Informática, Universidade de Évora
}
\begin{document}
\begin{frame}[plain]
\titlepage
\end{frame}
\section*{Motivation}
\begin{frame}
\frametitle{In short\ldots\hfill\small \ldots a word wall. I'm sorry.}
\begin{itemize}
\item \textbf{Machine Learning} has important limitations:
\begin{itemize}
\item The \emph{one table, conditionally independent rows} assumption.
\item \emph{Background knowledge} is hard to include.
\item \emph{Training} requires ``large'' amounts of data.
\item \emph{Models} are hard to interpret.
\end{itemize}
\item \textbf{Inductive Logic Programming} is based on first-order logic --- it solves all the problems above but is sensitive to \emph{noise}.
\item \textbf{Distribution Semantics} defines the probability of a proposition from probabilities of the (marginally independent) facts.
\item \textbf{Answer Set Programs} recast the common syntax and semantics of logic programs; a ``program'' defines \emph{stable models}, not a computation nor a variable substitution.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{\xout{Goals} Wish list}
\begin{block}{Extend distribution semantics to answer sets}
\begin{itemize}
\item Within a theoretical framework.
\item Computationally applicable to ``real world'' scenarios.
\item Easy to include background knowledge.
\item Perform common tasks such as \emph{marginal inference, maximum likelihood and MAP estimation, etc.}
\item Learn program ``parameters'' and ``structure'' from \emph{noisy samples} --- possibly using \emph{templates}.
\item Related to Bayesian Networks, HMMs, \emph{etc.}
\end{itemize}
\end{block}
\end{frame}
\section{Development}
\begin{frame}
\tableofcontents[currentsection]
\end{frame}
\begin{frame}
\frametitle{The seed of an idea}
We want to define the \textbf{joint distribution} of the stable models.
\begin{enumerate}
\item A \textbf{boolean random variable} can be described by a disjunction $a; \neg a$.
\item This one-clause ASP program has two stable models: $\set{a}$ and $\set{\neg a}$.
\item A program with $n$ such facts $a_i; \neg a_i$ has $2^n$ stable models, the distinct combinations of those choices.
\item \textbf{If each $a_i$ has probability $p_i$ then the probability of a stable model $W$ would be} $$P(W) = \prod_{a_i \in W}p_i \prod_{\neg a_i \in W} (1 - p_i).$$
\end{enumerate}
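\emph{E.g.} for $n = 2$, the model $\set{a_1, \neg a_2}$ would get probability $p_1 (1 - p_2)$.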
\pause
\begin{alertblock}{But this is wrong.}
Even assuming that those facts are marginally independent --- which we will do.
\end{alertblock}
\end{frame}
\begin{frame}{Problem 1: Disjunctive Clauses}
The ASP program with probabilistic facts
$$
\begin{aligned}
&b \vee \neg b\\
&h_1 \vee h_2 \leftarrow b
\end{aligned}
$$
has \textbf{three} stable models: $\set{\neg b}, \set{b, h_1}$ and $\set{b, h_2}$.
\begin{block}{How to assign a probability to each model?}
\pause
Possible approaches:
\begin{enumerate}
\item Pre-assign a \textbf{conditional distribution of the head}:
$$P(h_1, h_2 | b).$$
\item Bayesian learn from \textbf{observations}:
$$P(h_1, h_2 | b,z) \propto P(b, z | h_1, h_2) P(h_1, h_2).$$
\item Start with the former as \textbf{prior} and \textbf{update} with the latter.
\end{enumerate}
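With the first approach, writing $p_b$ for the probability of the fact $b$ and assuming exclusive head cases, the three models would get
$$P(\set{\neg b}) = 1 - p_b, \qquad P(\set{b, h_i}) = p_b \, P(h_i \mid b),~ i = 1, 2.$$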
\end{block}
\end{frame}
\begin{frame}
\frametitle{Questions to address}
\begin{itemize}
\item How to \textbf{match} an observation $z$ with a clause case $h,b$?
\item How do observations \textbf{update} the probabilities?
\item Why match observations with clauses and \textbf{not with stable models}?
\item Is this just a \textbf{Bayesian network} in disguise?
\item How to frame this in a \textbf{sound theoretic setting}?
\item Is this enough to compute the \textbf{joint distribution of the atoms}?
\end{itemize}
\onslide<2->
\begin{exampleblock}{Counters}
Instead of setting and updating probabilities, we associate \textbf{counters} with disjunctive clauses and their cases.
\end{exampleblock}
\end{frame}
\begin{frame}
\frametitle{Bayesian updates: Matching observations}
\begin{itemize}
\item An \alert{observation} is a subset of the literals from a program\footnote{The set of atoms, $a$, of the program and their classic negations, $\neg a$.}.
\item A \alert{consistent} observation has no subset $\set{p, \neg p}$.
\item A \textbf{consistent} observation $z$ is \alert{relevant} for the clause $h \leftarrow b$ if $b \subseteq z$.
\item A disjunctive clause $$h_1 \vee \cdots \vee h_n \leftarrow b_1 \wedge \cdots \wedge b_m$$ has $n$ \alert{cases}: $\set{h_i, b_1, \ldots, b_m}, i = 1:n$.
\item The \textbf{consistent} observation $z$ \alert{matches} the case $\set{h, b_\ast}$ if
$\set{h, b_\ast} \subseteq z$.
\end{itemize}
The above definitions also apply to \textbf{facts}, \emph{i.e.} clauses with an empty body, and to \textbf{constraints}, \emph{i.e.} clauses with no head.
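\emph{E.g.} $h_1 \vee h_2 \leftarrow b$ has the cases $\set{h_1, b}$ and $\set{h_2, b}$; the observation $z = \set{b, h_1}$ is relevant for the clause and matches only the first case.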
\end{frame}
\begin{frame}
\frametitle{Bayesian updates: Clauses Update}
A consistent observation \textbf{relevant} for a clause $h_1 \vee \cdots \vee h_n \leftarrow b$ should:
\begin{itemize}
\item Increase the \emph{probability of any matched case}.
\item Decrease the \emph{probability of any unmatched case}.
\end{itemize}
\pause
\begin{block}{Update algorithm}
\begin{enumerate}
\item Associate three \textbf{counters}, $r, u, n$, to each clause $h \leftarrow b$.
\item Associate a \textbf{counter}, $m_i$, to each case $h_i, b$ of each clause.
\item \textbf{Initial} values result from \emph{prior} knowledge.
\item Each \emph{consistent} observation \textbf{increments}:
\begin{itemize}
\item The $r$ counters of \alert{r}elevant clauses.
\item The $u$ counters of \alert{u}nmatched relevant clauses.
\item The $n$ counters of \alert{n}ot relevant clauses.
\item The $m_i$ counters of \alert{m}atched cases $h_i, b$.
\item Clause counters must satisfy $r \leq u + \sum_i m_i$ (an observation may match more than one case).
\end{itemize}
\end{enumerate}
\end{block}
\end{frame}
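\begin{frame}[fragile]
\frametitle{Bayesian updates: A sketch in code}
A minimal sketch of one update step --- illustrative only; it assumes observations are sets of literals and a clause is a pair of a head tuple and a body set:
{\scriptsize
\begin{verbatim}
# One update step of the counters of a single clause.
# clause = (head, body): head a tuple of literals, body a set.
# counters = (r, u, n, m): m is a list, one entry per head literal.
def update(clause, counters, z):
    head, body = clause
    r, u, n, m = counters
    if not body <= z:              # z is not relevant for the clause
        return (r, u, n + 1, m)
    matched = False
    for i, h in enumerate(head):   # case i is {h} union body
        if h in z:                 # z matches case i
            m[i] += 1
            matched = True
    if not matched:                # relevant but no case matched
        u += 1
    return (r + 1, u, n, m)        # z is relevant
\end{verbatim}
}
Each relevant observation increments $r$ and either $u$ or some $m_i$, so $r \leq u + \sum_i m_i$ is preserved.
\end{frame}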
\begin{frame}
\frametitle{Updates and counters: An example}
Given the following ASP program with \textbf{annotated counters},
$$
\begin{aligned}
%&H \leftarrow B&&\text{counters:}~ m_{1:n}; r, u, n \\
&b \vee \neg b &&\text{counters:}~ 7, 2; 12, 3, 0 \\
&h_1 \vee h_2 \leftarrow b &&\text{counters:}~ 4 , 3 ; 6, 2, 5
\end{aligned}
$$
\onslide*<2>{
\begin{columns}[t]
\begin{column}{0.5\textwidth}
\begin{block}{Counters of $b \vee \neg b$}\small
$0$ observations were not relevant (because the body is $\top$);
there were $12$ relevant observations;
of those, $b$ was matched by $7$, $\neg b$ by $2$, and $3$ observations matched neither ($\models\naf b, \naf \neg b$).
\end{block}
\end{column}
\begin{column}{0.5\textwidth}
\begin{block}{Counters of $h_1 \vee h_2 \leftarrow b$}\small
There were $11 = 6 + 5$ observations, $6$ of them relevant to this clause;
of these, $4$ matched $h_1$, $3$ matched $h_2$ and $2$ matched no case.
\end{block}
\end{column}
\end{columns}
}
\onslide<3->
\begin{block}{What can be computed?}
\begin{itemize}
\item $P(\neg b) = \frac{2}{12}$ because $\neg b$ matched $2$ of $12$ relevant observations.
\item $P(h_1 | b) = \frac{4}{6}$ because $h_1$ matched $4$ of $6$ relevant observations.
\item $P(b)$ \alert{can't be computed} without further information. \emph{E.g.}, assuming that \textbf{observations are independent},
$$P(b) = \frac{7 + 6}{12 + 0 + 6 + 5} = \frac{13}{23}.$$
\end{itemize}
\end{block}
\end{frame}
\begin{frame}
\frametitle{Updates and counters: An example}
Given the following ASP program with \textbf{annotated counters},
$$
\begin{aligned}
%&H \leftarrow B&&\text{counters:}~ m_{1:n}; r, u, n \\
&b \vee \neg b &&\text{counters:}~ 7, 2; 12, 3, 0 \\
&h_1 \vee h_2 \leftarrow b &&\text{counters:}~ 4 , 3 ; 6, 2, 5
\end{aligned}
$$
\begin{block}{Note\ldots}
\onslide*<1>{Counters are local to clauses and, for distinct clauses, may result from distinct sources. \emph{E.g. the relevant counter of $h_1 \vee h_2 \leftarrow b$ and the match counter of $b$ in $b \vee \neg b$.}}
\onslide*<2>{Some observations may contain neither $b$ nor $\neg b$: $$P(b) + P(\neg b) < 1.$$}
\onslide*<3>{Since $h_1$ and $h_2$ are not independent, $$\sum_m P(m) \approx 1.02 > 1.$$}
\onslide*<4>{What is missing to compute the \alert{joint distribution} of the program's atoms, $$P(H_1, H_2, B)?$$}
\end{block}
\end{frame}
\begin{frame}
\frametitle{Problem 2: Default Negation}
\begin{itemize}
\item How to deal with rules containing default negation, $\naf a$?
\item Should atoms missing from an observation be treated as default negated, $\naf a$?
\end{itemize}
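\emph{E.g.} given the rule $h \leftarrow \naf a$ and the observation $z = \set{h}$: is $z$ relevant because $a \not\in z$, or only if $\naf a$ is somehow explicit in $z$?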
\end{frame}
\section{Conclusions}
\begin{frame}
\tableofcontents[currentsection]
\end{frame}
\section*{Background Material}
\begin{frame}
\Huge Background Material
\end{frame}
\begin{frame}{Machine Learning}
Models are numeric functions: $y \approx f_\theta(x),~\theta_i, x_j, y\in\mathbf{R}$.
\begin{itemize}
\item Amazing achievements.
\item Noise tolerant.
\item (as of today) Huge enterprise funding.
\end{itemize}
but
\begin{itemize}
\item (essentially) Academically solved.
\item Models trained from ``large'' amounts of samples.
\item Hard to add background knowledge.
\item Models are hard to interpret.
\item Single table, independent rows assumption.
\end{itemize}
\end{frame}
\begin{frame}{Inductive Logic Programming}
Models are logic programs: $p_\theta(x, y),~\theta_i, x_j, y\in{\cal A}$.
\begin{itemize}
\item Amazing achievements, at scale.
\item Models trained from ``small'' amounts of samples.
\item Compact, readable models.
\item Background knowledge is easy to incorporate and edit.
\end{itemize}
but
\begin{itemize}
\item (as of today) Little enterprise commitment.
\item (as of today) Mostly academic interest.
\item Noise sensitive.
\end{itemize}
\end{frame}
\begin{frame}{Distribution Semantics}
Assigns probabilities to (marginally independent) facts and derives the probability of ground propositions.
Let $F$ be a set of facts, $S \subseteq F$, $R$ a set of definite clauses and $p$ a proposition:
$$\small
\begin{aligned}
P_F(S) &= \prod_{f \in S} P(f) \prod_{f \not\in S} \left(1 - P(f) \right) \cr
P(W) &= \sum_{S \subseteq F :~W=M(S\cup R)} P_F(S) \cr
P(p) &= \sum_{S :~ S\cup R ~\vdash~ p} P_F(S) = \sum_{W :~ p\in W} P(W)
\end{aligned}
$$
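\emph{E.g.} with $F = \set{a}$, $P(a) = 0.3$ and $R = \set{b \leftarrow a}$, only the selection $S = \set{a}$ derives $b$, so $P(b) = P_F(\set{a}) = 0.3$.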
\begin{itemize}
\item Amazing achievements, at scale.
\item Lots of tools and research.
\item The best of both ``worlds''?
\end{itemize}
\end{frame}
\begin{frame}{Answer Set Programming}
A ``program'' defines stable models \emph{i.e.} minimal sets of derived ground atoms\footnote{Alternative \xout{fact} definition: $X$ is a stable model of $P$ if $X = \text{Cn}(P^X)$.}.
\begin{itemize}
\item Pure declarative language, unlike Prolog.
\item Uses \emph{generate \& test} methods instead of proofs.
\item Uses both default negation $\naf p$ and classical negation $\neg p$.\footnote{Classic negation $\neg a$ in ASP results from replacing the occurrences of $\neg a$ by a new atom $a_\neg$ and adding the restriction $\leftarrow a_\neg, a$.}
\item Clauses can be disjunctive $a ; b \leftarrow c, d.$
\end{itemize}
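\emph{E.g.} the program $\set{a \leftarrow \naf b,~ b \leftarrow \naf a}$ has two stable models, $\set{a}$ and $\set{b}$.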
\end{frame}
\subsection*{Stable Sets}
\begin{frame}
\tableofcontents[currentsection]
\end{frame}
\subsection*{References}
\begin{frame}
\tableofcontents[currentsection]
\end{frame}
\end{document}