\documentclass[bigger]{beamer}
\useinnertheme{circles}
\usefonttheme[onlymath]{serif}
\usefonttheme{structurebold}
\setbeamertemplate{navigation symbols}{}
\usepackage{xcolor}
\setbeamercolor{highlight block}{bg=gray}

\usepackage{tikz}
\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc}
\usepackage[normalem]{ulem} % To strikeout
\usepackage{commath}

\newcommand{\naf}{\ensuremath{\sim\!\!}}

\title{Probabilistic Answer Set Programming}
\subtitle{A Research Draft}
\author{Francisco Coelho}
\institute[\texttt{fc@uevora.pt}]{
    NOVA LINCS \&\\
    High Performance Computing Chair \&\\
    Departamento de Informática, Universidade de Évora
}

\begin{document}
    \begin{frame}[plain]
        \titlepage
    \end{frame}
    
    \section*{Motivation}

    \begin{frame}
        \frametitle{In short\ldots\hfill\small \ldots a word wall. I'm sorry.}
    
        \begin{itemize}
            \item \textbf{Machine Learning} has important limitations:
            \begin{itemize}
                \item The \emph{one table, conditionally independent rows} assumption.
                \item \emph{Background knowledge} is hard to include.
                \item \emph{Training} requires ``large'' amounts of data.
                \item \emph{Models} are hard to interpret.
            \end{itemize}
            \item \textbf{Inductive Logic Programming} is based on first-order logic --- it addresses all the problems above but is sensitive to \emph{noise}.
            \item \textbf{Distribution Semantics} defines the probability of a proposition from the probabilities of the (marginally independent) facts.
            \item \textbf{Answer Set Programs} reset the common syntax and semantics of logic programs; a ``program'' defines \emph{stable models}, not a computation nor a variable substitution.
        \end{itemize}    
    \end{frame}

    \begin{frame}
        \frametitle{\xout{Goals} Wish list}

        \begin{block}{Extend distribution semantics to answer sets}
            \begin{itemize}
                \item Within a theoretical framework.
                \item Computationally applicable to ``real world'' scenarios.
                \item Easy to include background knowledge.
                \item Perform common tasks such as marginal (\emph{MARG}), maximum likelihood (\emph{MLE}) and maximum a posteriori (\emph{MAP}) inference.
                \item Learn program ``parameters'' and ``structure'' from \emph{noisy samples} --- possibly using \emph{templates}.
                \item Related to Bayesian Networks, HMMs, \emph{etc.}  
            \end{itemize}    
            
        \end{block}
    \end{frame}
    
    \section{Development}
    
    \begin{frame}
        \tableofcontents[currentsection]
    \end{frame}

    \begin{frame}
        \frametitle{The seed of an idea}
        We want to define the \textbf{joint distribution} of the stable models.
        \begin{enumerate} 
            \item A \textbf{boolean random variable} can be described by a disjunction $a; \neg a$.
            \item This one-clause ASP program has two stable models: $\set{a}$ and $\set{\neg a}$.
            \item A program with $n$ such facts $a_i; \neg a_i$ has $2^n$ stable models, the distinct combinations of those choices. 
            \item \textbf{If each $a_i$ has probability $p_i$ then the probability of a stable model $W$ would be} $$P(W) = \prod_{a_i \in W}p_i \prod_{\neg a_i \in W} (1 - p_i).$$            
        \end{enumerate}        
        \pause
        \begin{alertblock}{But this is wrong.}
            Even assuming that those facts are marginally independent --- which we will do.
        \end{alertblock}
    \end{frame}
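    \begin{frame}[fragile]
        \frametitle{The seed of an idea: a sketch}
        A minimal Python sketch of the previous slide's product formula, assuming $n$ marginally independent facts; the probabilities in the example are purely illustrative.
{\small
\begin{verbatim}
from itertools import product

def world_probabilities(p):
    # p: list of fact probabilities p_i; a world is a tuple of
    # booleans, True meaning a_i holds, False meaning neg a_i.
    for world in product([True, False], repeat=len(p)):
        prob = 1.0
        for holds, p_i in zip(world, p):
            prob *= p_i if holds else 1.0 - p_i
        yield world, prob

# Two facts with P(a_1) = 0.7, P(a_2) = 0.4: four stable
# models, whose probabilities sum to 1.
for world, prob in world_probabilities([0.7, 0.4]):
    print(world, round(prob, 3))
\end{verbatim}
}
    \end{frame}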

    \begin{frame}{Problem 1: Disjunctive Clauses}
        The ASP program with probabilistic facts
        $$        
        \begin{aligned}
            &b \vee \neg b\\
            &h_1 \vee h_2 \leftarrow b
        \end{aligned}
        $$
        has \textbf{three} stable models: $\set{\neg b}, \set{b, h_1}$ and $\set{b, h_2}$.  

        \begin{block}{How to assign a probability to each model?}
            \pause 
            Possible approaches:
            \begin{enumerate}
                \item Pre-assign a \textbf{conditional distribution of the head}:
                $$P(h_1, h_2 | b).$$
                \item Bayesian learn from \textbf{observations}:
                $$P(h_1, h_2 | b,z) \propto P(b, z | h_1, h_2) P(h_1, h_2).$$
                \item Start with the former as \textbf{prior} and \textbf{update} with the latter.
            \end{enumerate}     
        \end{block}       
    
    \end{frame}
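    \begin{frame}
        \frametitle{Problem 1: Normalizing the update}
        Spelling out the proportionality on the previous slide: the posterior over the head atoms is the prior reweighted by the likelihood of the observation and normalized over all head assignments,
        $$
        P(h_1, h_2 \mid b, z)
        = \frac{P(b, z \mid h_1, h_2)\, P(h_1, h_2)}
               {\sum_{h_1', h_2'} P(b, z \mid h_1', h_2')\, P(h_1', h_2')},
        $$
        so the updated values again form a distribution over the head cases.
    \end{frame}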

    \begin{frame}
        \frametitle{Questions to address}
    
        \begin{itemize}
            \item How to \textbf{match} an observation $z$ with a clause case $h,b$?
            \item How do observations \textbf{update} the probabilities?
            \item Why match observations with clauses and \textbf{not with stable models}?
            \item Is this just a \textbf{Bayesian network}?
            \item How to frame this in a \textbf{sound theoretic setting}?
            \item Is this enough to compute the \textbf{joint distribution of the atoms}?
        \end{itemize}
        \onslide<2->
        \begin{exampleblock}{Counters}
            Instead of setting and updating probabilities, we associate \textbf{counters} with disjunctive clauses and their cases.
        \end{exampleblock}
    
    \end{frame}

    \begin{frame}
        \frametitle{Bayesian updates: Matching observations}

        \begin{itemize}
            \item An \alert{observation} is a subset of the literals of a program\footnote{The set of atoms, $a$, of the program and their classical negations, $\neg a$.}.
            \item A \alert{consistent} observation has no subset $\set{p, \neg p}$.
            \item A \textbf{consistent} observation $z$ is \alert{relevant} for the clause $h \leftarrow b$ if $b \subseteq z$.
            \item A disjunctive clause $$h_1 \vee \cdots \vee h_n \leftarrow b_1 \wedge \cdots \wedge b_m$$ has $n$ \alert{cases}: $\set{h_i, b_1, \ldots, b_m}, i = 1:n$.
            \item The \textbf{consistent} observation $z$ \alert{matches} the case $\set{h, b_\ast}$ if 
            $\set{h, b_\ast} \subseteq z$.     
        \end{itemize}
        The above definitions also apply to \textbf{facts}, \emph{i.e.} clauses with an empty body, and \textbf{constraints}, \emph{i.e.} clauses with no head.
    \end{frame}
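    \begin{frame}[fragile]
        \frametitle{Matching observations: a sketch}
        A Python sketch of the definitions above; encoding literals as strings (\texttt{"a"} for an atom, \texttt{"-a"} for its classical negation) is an assumption of the sketch.
{\small
\begin{verbatim}
def neg(lit):
    return lit[1:] if lit.startswith("-") else "-" + lit

def consistent(z):
    # No literal occurs together with its classical negation.
    return all(neg(l) not in z for l in z)

def relevant(z, body):
    # z is relevant for h <- body if the body is contained in z.
    return consistent(z) and set(body) <= set(z)

def matches(z, head_i, body):
    # z matches the case {h_i} U body of a disjunctive clause.
    return consistent(z) and {head_i} | set(body) <= set(z)

z = {"b", "h1"}   # relevant for h1 ; h2 <- b, matches h1's case
assert relevant(z, ["b"]) and matches(z, "h1", ["b"])
assert not matches(z, "h2", ["b"])
\end{verbatim}
}
    \end{frame}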

    \begin{frame}
        \frametitle{Bayesian updates: Clauses Update}
        A consistent observation \textbf{relevant} for a clause $h_1 \vee \cdots \vee h_n \leftarrow b$ should:
            \begin{itemize}
                \item Increase the \emph{probability of any matched case}.
                \item Decrease the \emph{probability of any unmatched case}.
            \end{itemize}
        \pause
        \begin{block}{Update algorithm}
            \begin{enumerate}
                \item Associate three \textbf{counters}, $r, u, n$, to each clause $h \leftarrow b$.
                \item Associate a \textbf{counter}, $m_i$, to each case $h_i, b$ of each clause.
                \item \textbf{Initial} values result from \emph{prior} knowledge.
                \item Each \emph{consistent} observation \textbf{increments}:
                \begin{itemize}
                    \item The $r$ counters of \alert{r}elevant clauses.
                    \item The $u$ counters of \alert{u}nmatched relevant clauses. 
                    \item The $n$ counters of \alert{n}ot relevant clauses.
                    \item The $m_i$ counters of \alert{m}atched cases $h_i, b$.
                    \item The counters of each clause always satisfy $r \leq u + \sum_i m_i$.
                \end{itemize}
            \end{enumerate}  
        \end{block}
    \end{frame}
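    \begin{frame}[fragile]
        \frametitle{Update algorithm: a sketch}
        A Python sketch of the update, reusing \texttt{relevant} and \texttt{matches} from the previous sketch; the clause representation (one \texttt{dict} per clause) is hypothetical.
{\small
\begin{verbatim}
def update(clause, z):
    # Update one clause's counters with a consistent observation z.
    # clause: {"head": [...], "body": [...],
    #          "m": [...], "r": int, "u": int, "n": int}
    if not relevant(z, clause["body"]):
        clause["n"] += 1                  # not relevant
        return
    clause["r"] += 1                      # relevant
    matched = [i for i, h in enumerate(clause["head"])
               if matches(z, h, clause["body"])]
    if not matched:
        clause["u"] += 1                  # relevant but unmatched
    for i in matched:
        clause["m"][i] += 1               # matched cases
    assert clause["r"] <= clause["u"] + sum(clause["m"])

c = {"head": ["h1", "h2"], "body": ["b"],
     "m": [0, 0], "r": 0, "u": 0, "n": 0}
update(c, {"b", "h1"})   # increments r and m[0]
update(c, {"-b"})        # increments n
\end{verbatim}
}
    \end{frame}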

    \begin{frame}
        \frametitle{Updates and counters: An example}
        Given the following ASP program with \textbf{annotated counters},
        $$        
        \begin{aligned}
            %&H \leftarrow B&&\text{counters:}~ m_{1:n}; r, u, n \\
            &b \vee \neg b &&\text{counters:}~ 7, 2; 12, 3, 0 \\
            &h_1 \vee h_2 \leftarrow b  &&\text{counters:}~ 4 , 3 ; 6, 2, 5
        \end{aligned}
        $$
        \onslide*<2>{
            \begin{columns}[t]
                \begin{column}{0.5\textwidth}                    
                \begin{block}{Counters of $b \vee \neg b$}\small
                    $0$ observations were not relevant (because the body is $\top$); 
                    
                    There were $12$ relevant observations; 
                    
                    Of those, $b$ was matched by $7$, $\neg b$ by $2$ and $3$ observations matched neither ($\models\naf b, \naf \neg b$).
                \end{block}
                \end{column}
                \begin{column}{0.5\textwidth}                    
                \begin{block}{Counters of $h_1 \vee h_2 \leftarrow b$}\small
                    There were $11 = 6 + 5$ observations, $6$ of them relevant to this clause; 
                    
                    From these, $4$ matched $h_1$, $3$ matched $h_2$ and $2$ matched no case ($4 + 3 + 2 > 6$: some observations matched both cases).
                \end{block}
                \end{column}
            \end{columns}
        }
        \onslide<3->
        \begin{block}{What can be computed?}
            \begin{itemize}
                \item $P(\neg b) = \frac{2}{12}$ because $\neg b$ matched $2$ of $12$ relevant observations.
                \item $P(h_1 | b) = \frac{4}{6}$ because $h_1$ matched $4$ of $6$ relevant observations.
                \item $P(b)$ \alert{can't be computed} without further information. \emph{E.g.} supposing that \textbf{observations are independent} then
                $$P(b) = \frac{7 + 6}{12 + 0 + 6 + 5}.$$ 
            \end{itemize}
            
        \end{block}
    \end{frame}
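    \begin{frame}[fragile]
        \frametitle{Updates and counters: reading off the probabilities}
        A sketch that recomputes the previous slide's values from the counters; the pooling used for $P(b)$ relies on the independence assumption stated there.
{\small
\begin{verbatim}
fact   = {"m": [7, 2], "r": 12, "u": 3, "n": 0}  # b ; -b
clause = {"m": [4, 3], "r": 6,  "u": 2, "n": 5}  # h1 ; h2 <- b

p_not_b      = fact["m"][1] / fact["r"]          # 2/12
p_h1_given_b = clause["m"][0] / clause["r"]      # 4/6
# P(b): pool b-matches over all observations, assuming the
# observations of the two clauses are independent.
p_b = (fact["m"][0] + clause["r"]) \
      / (fact["r"] + fact["n"] + clause["r"] + clause["n"])
print(p_not_b, p_h1_given_b, p_b)   # 0.167 0.667 0.565 (= 13/23)
\end{verbatim}
}
    \end{frame}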

    \begin{frame}
        \frametitle{Updates and counters: An example}
        Given the following ASP program with \textbf{annotated counters},
        $$        
        \begin{aligned}
            %&H \leftarrow B&&\text{counters:}~ m_{1:n}; r, u, n \\
            &b \vee \neg b &&\text{counters:}~ 7, 2; 12, 3, 0 \\
            &h_1 \vee h_2 \leftarrow b  &&\text{counters:}~ 4 , 3 ; 6, 2, 5
        \end{aligned}
        $$
        \begin{block}{Note\ldots}
            \onslide*<1>{Counters are local to clauses and, for distinct clauses, may result from distinct sources. \emph{E.g. the relevant counter of $h_1 \vee h_2 \leftarrow b$ and the match counter of $b$ in $b \vee \neg b$.}}
            \onslide*<2>{Some observations may have neither $b$ nor $\neg b$: $$P(b) + P(\neg b) < 1.$$}
            \onslide*<3>{Since $h_1$ and $h_2$ are not independent, $$\sum_m P(m) \approx 1.02 > 1.$$}
            \onslide*<4>{What is missing to compute the \alert{joint distribution} of the program's atoms $$P(H_1, H_2, B)?$$}
        \end{block}
    \end{frame}
    
    \begin{frame}
        \frametitle{Problem 2: Default Negation}
    
        \begin{itemize}
            \item How to deal with rules containing default negation, $\naf a$?
            \item Should atoms missing from an observation be treated as default negated, $\naf a$?
        \end{itemize}
    \end{frame}
    \section{Conclusions}
    
    \begin{frame}
        \tableofcontents[currentsection]
    \end{frame}
    
    \section*{Background Material}
    
    \begin{frame}
        \Huge Background Material
    \end{frame}

    \begin{frame}{Machine Learning}
        Models are numeric functions: $y \approx f_\theta(x),~\theta_i, x_j, y\in\mathbf{R}$.
        \begin{itemize}
            \item Amazing achievements.
            \item Noise tolerant.
            \item Huge enterprise funding (as of today).
        \end{itemize}
        but
        \begin{itemize}
            \item Essentially solved, as an academic problem.
            \item Models trained from ``large'' amounts of samples.
            \item Hard to add background knowledge.
            \item Models are hard to interpret.
            \item Single table, independent rows assumption.
        \end{itemize}
    \end{frame}
    
    \begin{frame}{Inductive Logic Programming}
        Models are logic programs: $p_\theta(x, y),~\theta_i, x_j, y\in{\cal A}$.
        \begin{itemize}
            \item Amazing achievements, at scale.
            \item Models trained from ``small'' amounts of samples.
            \item Compact, readable models.
            \item Background knowledge is easy to incorporate and edit.
        \end{itemize}
        but
        \begin{itemize}
            \item Little enterprise commitment (as of today).
            \item Mostly academic interest (as of today).
            \item Noise sensitive.
        \end{itemize}
    \end{frame}

    \begin{frame}{Distribution Semantics}
        Assigns probabilities to (marginally independent) facts and derives the probability of ground propositions.

        Let $F$ be a set of facts, $S \subseteq F$, $R$ a set of definite clauses, and $p$ a proposition:
        $$\small
        \begin{aligned}
            P_F(S) &= \prod_{f \in S} P(f) \prod_{f \not\in S} \left(1 - P(f) \right) \cr
            P(W) &= \sum_{S \subseteq F :~W=M(S\cup R)} P_F(S) \cr
            P(p) &= \sum_{S :~ S\cup R ~\vdash~ p} P_F(S) = \sum_{W :~ p\in W} P(W)
        \end{aligned}
        $$
        \begin{itemize}
            \item Amazing achievements, at scale.
            \item Lots of tools and research.
            \item The best of both ``worlds''?
        \end{itemize}
        
    \end{frame}
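    \begin{frame}{Distribution Semantics: a worked example}
        A minimal instance of the formulas above, with arbitrarily chosen probabilities. Take $F = \set{a, b}$ with $P(a) = 0.3$, $P(b) = 0.5$, and $R = \set{p \leftarrow a}$. Exactly the subsets containing $a$ derive $p$, so
        $$\small
        \begin{aligned}
            P_F(\set{a}) &= 0.3 \times (1 - 0.5) = 0.15 \cr
            P(p) &= P_F(\set{a}) + P_F(\set{a, b}) = 0.15 + 0.15 = 0.3.
        \end{aligned}
        $$
    \end{frame}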

    \begin{frame}{Answer Set Programming}
        A ``program'' defines stable models \emph{i.e.} minimal sets of derived ground atoms\footnote{Alternative \xout{fact} definition: $X$ is a stable model of $P$ if $X = \text{Cn}(P^X)$.}.  
        \begin{itemize}
            \item Pure declarative language, unlike Prolog.
            \item Uses \emph{generate \& test} methods instead of proofs.
            \item Uses both default negation $\naf p$ and classical negation $\neg p$\footnote{Classical negation $\neg a$ in ASP results from replacing the occurrences of $\neg a$ by a new atom $a_\neg$ and adding the constraint $\leftarrow a_\neg, a$.}.
            \item Clauses can be disjunctive: $a ; b \leftarrow c, d$.
        \end{itemize}
    \end{frame}

    \subsection*{Stable Models}
    
    \begin{frame}
        \tableofcontents[currentsection]
    \end{frame}

    \subsection*{References}
    
    \begin{frame}
        \tableofcontents[currentsection]
    \end{frame}
\end{document}