\documentclass[bigger]{beamer}
\useinnertheme{circles}
\usefonttheme[onlymath]{serif}
\usefonttheme{structurebold}
\setbeamertemplate{navigation symbols}{}
\usepackage{xcolor}
\setbeamercolor{highlight block}{bg=gray}

\usepackage{tikz}
\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc}
\usepackage[normalem]{ulem} % To strikeout
\usepackage{commath}

\newcommand{\naf}{\ensuremath{\sim\!\!}}

\title{Probabilistic Answer Set Programming}
\subtitle{A Research Draft}
\author{Francisco Coelho}
\institute[\texttt{fc@uevora.pt}]{
    NOVA LINCS \&\\
    High Performance Computing Chair \&\\
    Departamento de Informática, Universidade de Évora
}

\begin{document}
    \begin{frame}[plain]
        \titlepage
    \end{frame}
    
    \section*{Motivation}

    \begin{frame}
        \frametitle{In short\ldots\hfill\small \ldots a word wall. I'm sorry.}
    
        \begin{itemize}
            \item \textbf{Machine Learning} has important limitations:
            \begin{itemize}
                \item The \emph{one table, conditionally independent rows} assumption.
                \item \emph{Background knowledge} is hard to include.
                \item \emph{Training} requires ``large'' amounts of data.
                \item \emph{Models} are hard to interpret.
            \end{itemize}
            \item \textbf{Inductive Logic Programming} is based on first-order logic --- it addresses all the problems above but is sensitive to \emph{noise}.
            \item \textbf{Distribution Semantics} defines the probability of a proposition from the probabilities of the (marginally independent) facts.
            \item \textbf{Answer Set Programs} reset the common syntax and semantics of logic programs; a ``program'' defines \emph{stable models}, not a computation nor a variable substitution.
        \end{itemize}    
    \end{frame}

    \begin{frame}
        \frametitle{\xout{Goals} Wish list}

        \begin{block}{Extend distribution semantics to answer sets}
            \begin{itemize}
                \item Within a theoretical framework.
                \item Computationally applicable to ``real world'' scenarios.
                \item Easy to include background knowledge.
                \item Perform common tasks such as marginal (\emph{MARG}), maximum likelihood (\emph{MLE}) and maximum a posteriori (\emph{MAP}) inference.
                \item Learn program ``parameters'' and ``structure'' from \emph{noisy samples} --- possibly using \emph{templates}.
                \item Related to Bayesian Networks, HMMs, \emph{etc.}  
            \end{itemize}    
            
        \end{block}
    \end{frame}
    
    \section{Development}
    
    \begin{frame}
        \tableofcontents[currentsection]
    \end{frame}

    \begin{frame}
        \frametitle{The seed of an idea}
        We want to define the \textbf{joint distribution} of the stable models.
        \begin{enumerate} 
            \item A \textbf{boolean random variable} can be described by a disjunction $a; \neg a$.
            \item This one-clause ASP program has two stable models: $\set{a}$ and $\set{\neg a}$.
            \item A program with $n$ such facts $a_i; \neg a_i$ has $2^n$ stable models, the distinct combinations of those choices. 
            \item \textbf{If each $a_i$ has probability $p_i$ then the probability of a stable model $W$ would be} $$P(W) = \prod_{a_i \in W}p_i \prod_{\neg a_i \in W} (1 - p_i).$$            
        \end{enumerate}        
        \pause
        \begin{alertblock}{But this is wrong.}
            Even assuming that those facts are marginally independent --- which we will do.
        \end{alertblock}
    \end{frame}
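    \begin{frame}[fragile]
        \frametitle{The seed of an idea: a sketch}
        A minimal Python sketch of the previous slide's product formula, assuming $n$ marginally independent facts; the probabilities in the example are purely illustrative.
{\small
\begin{verbatim}
from itertools import product

def world_probabilities(p):
    # p: list of fact probabilities p_i; a world is a tuple of
    # booleans, True meaning a_i holds, False meaning neg a_i.
    for world in product([True, False], repeat=len(p)):
        prob = 1.0
        for holds, p_i in zip(world, p):
            prob *= p_i if holds else 1.0 - p_i
        yield world, prob

# Two facts with P(a_1) = 0.7, P(a_2) = 0.4: four stable
# models, whose probabilities sum to 1.
for world, prob in world_probabilities([0.7, 0.4]):
    print(world, round(prob, 3))
\end{verbatim}
}
    \end{frame}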

    \begin{frame}{Problem 1: Disjunctive Clauses}
        The ASP program with probabilistic facts
        $$        
        \begin{aligned}
            &b \vee \neg b\\
            &h_1 \vee h_2 \leftarrow b
        \end{aligned}
        $$
        has \textbf{three} stable models: $\set{\neg b}, \set{b, h_1}$ and $\set{b, h_2}$.  

        \begin{block}{How to assign a probability to each model?}
            \pause 
            Possible approaches:
            \begin{enumerate}
                \item Pre-assign a \textbf{conditional distribution of the head}:
                $$P(h_1, h_2 | b).$$
                \item Bayesian learn from \textbf{observations}:
                $$P(h_1, h_2 | b,z) \propto P(b, z | h_1, h_2) P(h_1, h_2).$$
                \item Start with the former as \textbf{prior} and \textbf{update} with the latter.
            \end{enumerate}     
        \end{block}       
    
    \end{frame}
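    \begin{frame}
        \frametitle{Problem 1: Normalizing the update}
        Spelling out the proportionality on the previous slide: the posterior over the head atoms is the prior reweighted by the likelihood of the observation and normalized over all head assignments,
        $$
        P(h_1, h_2 \mid b, z)
        = \frac{P(b, z \mid h_1, h_2)\, P(h_1, h_2)}
               {\sum_{h_1', h_2'} P(b, z \mid h_1', h_2')\, P(h_1', h_2')},
        $$
        so the updated values again form a distribution over the head cases.
    \end{frame}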

    \begin{frame}
        \frametitle{Questions to address}
    
        \begin{itemize}
            \item How to \textbf{match} an observation $z$ with a clause case $h,b$?
            \item How do observations \textbf{update} the probabilities?
            \item Why match observations with clauses and \textbf{not with stable models}?
            \item Is this just a \textbf{Bayesian network}?
            \item How to frame this in a \textbf{sound theoretic setting}?
            \item Is this enough to compute the \textbf{joint distribution of the atoms}?
        \end{itemize}
        \onslide<2->
        \begin{exampleblock}{Counters}
            Instead of setting and updating probabilities, we associate \textbf{counters} with disjunctive clauses and their cases.
        \end{exampleblock}
    
    \end{frame}

    \begin{frame}
        \frametitle{Bayesian updates: Matching observations}

        \begin{itemize}
            \item An \alert{observation} is a subset of the literals of a program\footnote{The set of atoms, $a$, of the program and their classical negations, $\neg a$.}.
            \item A \alert{consistent} observation has no subset $\set{p, \neg p}$.
            \item A \textbf{consistent} observation $z$ is \alert{relevant} for the clause $h \leftarrow b$ if $b \subseteq z$.
            \item A disjunctive clause $$h_1 \vee \cdots \vee h_n \leftarrow b_1 \wedge \cdots \wedge b_m$$ has $n$ \alert{cases}: $\set{h_i, b_1, \ldots, b_m}, i = 1:n$.
            \item The \textbf{consistent} observation $z$ \alert{matches} the case $\set{h, b_\ast}$ if 
            $\set{h, b_\ast} \subseteq z$.     
        \end{itemize}
        The above definitions also apply to \textbf{facts}, \emph{i.e.} clauses with an empty body, and \textbf{constraints}, \emph{i.e.} clauses with no head.
    \end{frame}
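    \begin{frame}[fragile]
        \frametitle{Matching observations: a sketch}
        A Python sketch of the definitions above; encoding literals as strings (\texttt{"a"} for an atom, \texttt{"-a"} for its classical negation) is an assumption of the sketch.
{\small
\begin{verbatim}
def neg(lit):
    return lit[1:] if lit.startswith("-") else "-" + lit

def consistent(z):
    # No literal occurs together with its classical negation.
    return all(neg(l) not in z for l in z)

def relevant(z, body):
    # z is relevant for h <- body if the body is contained in z.
    return consistent(z) and set(body) <= set(z)

def matches(z, head_i, body):
    # z matches the case {h_i} U body of a disjunctive clause.
    return consistent(z) and {head_i} | set(body) <= set(z)

z = {"b", "h1"}   # relevant for h1 ; h2 <- b, matches h1's case
assert relevant(z, ["b"]) and matches(z, "h1", ["b"])
assert not matches(z, "h2", ["b"])
\end{verbatim}
}
    \end{frame}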

    \begin{frame}
        \frametitle{Bayesian updates: Clauses Update}
        A consistent observation \textbf{relevant} for a clause $h_1 \vee \cdots \vee h_n \leftarrow b$ should:
            \begin{itemize}
                \item Increase the \emph{probability of any matched case}.
                \item Decrease the \emph{probability of any unmatched case}.
            \end{itemize}
        \pause
        \begin{block}{Update algorithm}
            \begin{enumerate}
                \item Associate three \textbf{counters}, $r, u, n$, to each clause $h \leftarrow b$.
                \item Associate a \textbf{counter}, $m_i$, to each case $h_i, b$ of each clause.
                \item \textbf{Initial} values result from \emph{prior} knowledge.
                \item Each \emph{consistent} observation \textbf{increments}:
                \begin{itemize}
                    \item The $r$ counters of \alert{r}elevant clauses.
                    \item The $u$ counters of \alert{u}nmatched relevant clauses. 
                    \item The $n$ counters of \alert{n}ot relevant clauses.
                    \item The $m_i$ counters of \alert{m}atched cases $h_i, b$.
                    \item The counters of each clause always satisfy $r \leq u + \sum_i m_i$.
                \end{itemize}
            \end{enumerate}  
        \end{block}
    \end{frame}
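    \begin{frame}[fragile]
        \frametitle{Update algorithm: a sketch}
        A Python sketch of the update, reusing \texttt{relevant} and \texttt{matches} from the previous sketch; the clause representation (one \texttt{dict} per clause) is hypothetical.
{\small
\begin{verbatim}
def update(clause, z):
    # Update one clause's counters with a consistent observation z.
    # clause: {"head": [...], "body": [...],
    #          "m": [...], "r": int, "u": int, "n": int}
    if not relevant(z, clause["body"]):
        clause["n"] += 1                  # not relevant
        return
    clause["r"] += 1                      # relevant
    matched = [i for i, h in enumerate(clause["head"])
               if matches(z, h, clause["body"])]
    if not matched:
        clause["u"] += 1                  # relevant but unmatched
    for i in matched:
        clause["m"][i] += 1               # matched cases
    assert clause["r"] <= clause["u"] + sum(clause["m"])

c = {"head": ["h1", "h2"], "body": ["b"],
     "m": [0, 0], "r": 0, "u": 0, "n": 0}
update(c, {"b", "h1"})   # increments r and m[0]
update(c, {"-b"})        # increments n
\end{verbatim}
}
    \end{frame}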

    \begin{frame}
        \frametitle{Updates and counters: An example}
        Given the following ASP program with \textbf{annotated counters},
        $$        
        \begin{aligned}
            %&H \leftarrow B&&\text{counters:}~ m_{1:n}; r, u, n \\
            &b \vee \neg b &&\text{counters:}~ 7, 2; 12, 3, 0 \\
            &h_1 \vee h_2 \leftarrow b  &&\text{counters:}~ 4 , 3 ; 6, 2, 5
        \end{aligned}
        $$
        \onslide*<2>{
            \begin{columns}[t]
                \begin{column}{0.5\textwidth}                    
                \begin{block}{Counters of $b \vee \neg b$}\small
                    $0$ observations were not relevant (because the body is $\top$); 
                    
                    There were $12$ relevant observations; 
                    
                    Of those, $b$ was matched by $7$, $\neg b$ by $2$ and $3$ observations matched neither ($\models\naf b, \naf \neg b$).
                \end{block}
                \end{column}
                \begin{column}{0.5\textwidth}                    
                \begin{block}{Counters of $h_1 \vee h_2 \leftarrow b$}\small
                    There were $11 = 6 + 5$ observations, $6$ of them relevant to this clause; 
                    
                    From these, $4$ matched $h_1$, $3$ matched $h_2$ and $2$ matched no case ($4 + 3 + 2 > 6$: some observations matched both cases).
                \end{block}
                \end{column}
            \end{columns}
        }
        \onslide<3->
        \begin{block}{What can be computed?}
            \begin{itemize}
                \item $P(\neg b) = \frac{2}{12}$ because $\neg b$ matched $2$ of $12$ relevant observations.
                \item $P(h_1 | b) = \frac{4}{6}$ because $h_1$ matched $4$ of $6$ relevant observations.
                \item $P(b)$ \alert{can't be computed} without further information. \emph{E.g.} supposing that \textbf{observations are independent} then
                $$P(b) = \frac{7 + 6}{12 + 0 + 6 + 5}.$$ 
            \end{itemize}
            
        \end{block}
    \end{frame}
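    \begin{frame}[fragile]
        \frametitle{Updates and counters: reading off the probabilities}
        A sketch that recomputes the previous slide's values from the counters; the pooling used for $P(b)$ relies on the independence assumption stated there.
{\small
\begin{verbatim}
fact   = {"m": [7, 2], "r": 12, "u": 3, "n": 0}  # b ; -b
clause = {"m": [4, 3], "r": 6,  "u": 2, "n": 5}  # h1 ; h2 <- b

p_not_b      = fact["m"][1] / fact["r"]          # 2/12
p_h1_given_b = clause["m"][0] / clause["r"]      # 4/6
# P(b): pool b-matches over all observations, assuming the
# observations of the two clauses are independent.
p_b = (fact["m"][0] + clause["r"]) \
      / (fact["r"] + fact["n"] + clause["r"] + clause["n"])
print(p_not_b, p_h1_given_b, p_b)   # 0.167 0.667 0.565 (= 13/23)
\end{verbatim}
}
    \end{frame}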

    \begin{frame}
        \frametitle{Updates and counters: An example}
        Given the following ASP program with \textbf{annotated counters},
        $$        
        \begin{aligned}
            %&H \leftarrow B&&\text{counters:}~ m_{1:n}; r, u, n \\
            &b \vee \neg b &&\text{counters:}~ 7, 2; 12, 3, 0 \\
            &h_1 \vee h_2 \leftarrow b  &&\text{counters:}~ 4 , 3 ; 6, 2, 5
        \end{aligned}
        $$
        \begin{block}{Note\ldots}
            \onslide*<1>{Counters are local to clauses and, for distinct clauses, may result from distinct sources. \emph{E.g. the relevant counter of $h_1 \vee h_2 \leftarrow b$ and the match counter of $b$ in $b \vee \neg b$.}}
            \onslide*<2>{Some observations may have neither $b$ nor $\neg b$: $$P(b) + P(\neg b) < 1.$$}
            \onslide*<3>{Since $h_1$ and $h_2$ are not independent, $$\sum_m P(m) \approx 1.02 > 1.$$}
            \onslide*<4>{What is missing to compute the \alert{joint distribution} of the program's atoms $$P(H_1, H_2, B)?$$}
        \end{block}
    \end{frame}
    
    \begin{frame}
        \frametitle{Problem 2: Default Negation}
    
        \begin{itemize}
            \item How to deal with rules containing default negation, $\naf a$?
            \item Should atoms missing from an observation be treated as default negated, $\naf a$?
        \end{itemize}
    \end{frame}
    \section{Conclusions}
    
    \begin{frame}
        \tableofcontents[currentsection]
    \end{frame}
    
    \section*{Background Material}
    
    \begin{frame}
        \Huge Background Material
    \end{frame}

    \begin{frame}{Machine Learning}
        Models are numeric functions: $y \approx f_\theta(x),~\theta_i, x_j, y\in\mathbf{R}$.
        \begin{itemize}
            \item Amazing achievements.
            \item Noise tolerant.
            \item Huge enterprise funding (as of today).
        \end{itemize}
        but
        \begin{itemize}
            \item Essentially solved, as an academic problem.
            \item Models trained from ``large'' amounts of samples.
            \item Hard to add background knowledge.
            \item Models are hard to interpret.
            \item Single table, independent rows assumption.
        \end{itemize}
    \end{frame}
    
    \begin{frame}{Inductive Logic Programming}
        Models are logic programs: $p_\theta(x, y),~\theta_i, x_j, y\in{\cal A}$.
        \begin{itemize}
            \item Amazing achievements, at scale.
            \item Models trained from ``small'' amounts of samples.
            \item Compact, readable models.
            \item Background knowledge is easy to incorporate and edit.
        \end{itemize}
        but
        \begin{itemize}
            \item Little enterprise commitment (as of today).
            \item Mostly academic interest (as of today).
            \item Noise sensitive.
        \end{itemize}
    \end{frame}

    \begin{frame}{Distribution Semantics}
        Assigns probabilities to (marginally independent) facts and derives the probability of ground propositions.

        Let $F$ be a set of facts, $S \subseteq F$, $R$ a set of definite clauses, and $p$ a proposition:
        $$\small
        \begin{aligned}
            P_F(S) &= \prod_{f \in S} P(f) \prod_{f \not\in S} \left(1 - P(f) \right) \cr
            P(W) &= \sum_{S \subseteq F :~W=M(S\cup R)} P_F(S) \cr
            P(p) &= \sum_{S :~ S\cup R ~\vdash~ p} P_F(S) = \sum_{W :~ p\in W} P(W)
        \end{aligned}
        $$
        \begin{itemize}
            \item Amazing achievements, at scale.
            \item Lots of tools and research.
            \item The best of both ``worlds''?
        \end{itemize}
        
    \end{frame}
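    \begin{frame}{Distribution Semantics: a worked example}
        A minimal instance of the formulas above, with arbitrarily chosen probabilities. Take $F = \set{a, b}$ with $P(a) = 0.3$, $P(b) = 0.5$, and $R = \set{p \leftarrow a}$. Exactly the subsets containing $a$ derive $p$, so
        $$\small
        \begin{aligned}
            P_F(\set{a}) &= 0.3 \times (1 - 0.5) = 0.15 \cr
            P(p) &= P_F(\set{a}) + P_F(\set{a, b}) = 0.15 + 0.15 = 0.3.
        \end{aligned}
        $$
    \end{frame}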

    \begin{frame}{Answer Set Programming}
        A ``program'' defines stable models \emph{i.e.} minimal sets of derived ground atoms\footnote{Alternative \xout{fact} definition: $X$ is a stable model of $P$ if $X = \text{Cn}(P^X)$.}.  
        \begin{itemize}
            \item Pure declarative language, unlike Prolog.
            \item Uses \emph{generate \& test} methods instead of proofs.
            \item Uses both default negation $\naf p$ and classical negation $\neg p$\footnote{Classical negation $\neg a$ in ASP results from replacing the occurrences of $\neg a$ by a new atom $a_\neg$ and adding the constraint $\leftarrow a_\neg, a$.}.
            \item Clauses can be disjunctive: $a ; b \leftarrow c, d$.
        \end{itemize}
    \end{frame}

    \subsection*{Stable Models}
    
    \begin{frame}
        \tableofcontents[currentsection]
    \end{frame}

    \subsection*{References}
    
    \begin{frame}
        \tableofcontents[currentsection]
    \end{frame}
\end{document}