add example repairs to appendix

breandan · Mar 30, 2024 · f7c457b · f7c457b
1 parent fac7e7c
commit f7c457b
Show file tree

Hide file tree

Showing 4 changed files with 108 additions and 10 deletions.
diff --git a/latex/splash2024/preamble.tex b/latex/splash2024/preamble.tex
@@ -96,13 +96,13 @@
 
 \usepackage[skins,breakable,listings]{tcolorbox}
 
-\lstdefinelanguage{kotlin}{
+\lstdefinelanguage{python}{
   comment=[l]{//},
   commentstyle={\color{gray}\ttfamily},
   emph={delegate, filter, firstOrNull, forEach, it, lazy, mapNotNull, println, repeat, assert, with, head, tail, len, return@},
   numberstyle=\noncopyable,
   identifierstyle=\color{black},
-  keywords={abstract, actual, as, as?, break, by, class, companion, continue, data, do, dynamic, else, enum, expect, false, final, for, fun, get, if, import, in, infix, interface, internal, is, null, object, open, operator, override, package, private, public, return, sealed, set, super, suspend, this, throw, true, try, catch, typealias, val, var, vararg, when, where, while, tailrec, reified},
+  keywords={abstract, actual, as, as?, break, by, class, companion, continue, data, do, dynamic, else, enum, expect, false, final, for, fun, get, if, import, in, infix, interface, internal, is, null, object, open, operator, override, package, private, public, return, sealed, set, super, suspend, this, throw, true, try, catch, typealias, val, var, vararg, when, where, while, tailrec, reified, from, import, def, yield, lambda, as, in, return, else, pass},
   keywordstyle={\bfseries},
   morecomment=[s]{/*}{*/},
   morestring=[b]",
@@ -114,6 +114,7 @@
   literate={`}{{\char0}}1,
   escapeinside={(*@}{@*)}
 }
+
 \lstdefinelanguage{tidy}{
   comment=[l]{//},
   commentstyle={\color{gray}\ttfamily},

diff --git a/latex/splash2024/splash.pdf b/latex/splash2024/splash.pdf
diff --git a/latex/splash2024/splash.tex b/latex/splash2024/splash.tex
@@ -243,7 +243,7 @@
 
   Likewise, a finite state automaton is a quintuple $\mathcal{A} = \langle Q, \Sigma, \delta, I, F\rangle$, where $Q$ is a finite set of states, $\Sigma$ is a finite alphabet, $\delta \subseteq Q \times \Sigma \times Q$ is the transition function, and $I, F \subseteq Q$ are the set of initial and final states, respectively. We will adhere to this notation in the following sections.
 
-  \pagebreak\subsection{The nominal Levenshtein automaton}\label{sec:lev_nfa}
+  \pagebreak\subsection{Modeling code edits with the Levenshtein automaton}\label{sec:lev_nfa}
 
   \begin{wrapfigure}{r}{0.5\textwidth}
     \vspace{-0.3cm}
@@ -446,7 +446,7 @@
 
   Nominalizing the NFA eliminates the creation of $e=2(|\Sigma| - 1)\cdot|\sigma|\cdot d_\max$ unnecessary arcs over the entire Levenshtein automaton and drastically reduces the size of the construction to follow, but does not affect the underlying semantics. Thus, it is essential to first nominalize the automaton before proceeding to avoid a large blowup in the intermediate grammar.
 
-  \subsection{Levenshtein-Bar-Hillel Construction}\label{sec:lev_bh}
+  \subsection{Recognizing syntactically valid code changes via language intersection}\label{sec:lev_bh}
 
   We now describe the Bar-Hillel construction, which generates a grammar recognizing the intersection between a regular and a context-free language, then specialize it to Levenshtein intersections.
 
@@ -1265,20 +1265,115 @@
 % References will then be sorted and formatted in the correct style.
 %
 %  \bibliographystyle{splncs04}
-  \bibliography{../bib/acmart}
+  \pagebreak\bibliography{../bib/acmart}
 
-\pagebreak \appendix
+  \pagebreak\appendix
 
-  \section{Raw data} \label{sec:appendix}
+  \section{Example Repairs}\label{sec:example_repairs}
 
-  Raw data from Precision@k experiments across snippet length and Levenshtein distance from \S~\ref{sec:stackoverflow}.
+  Below, we provide a few examples of broken code snippets and their corresponding human repairs that were successfully discovered and ranked first by our method. On the left is the complete snippet fed to the model, and on the right is the corresponding human repair that was correctly predicted.
+
+  \begin{figure}[H]
+      \begin{tabular}{|m{6.6cm}|m{6.6cm}|}
+        \hline \rule{0pt}{2.5ex}\textbf{Original broken code}\rule[-1ex]{0pt}{2ex} &  \rule{0pt}{2.5ex}\textbf{First predicted repair}\rule[-1ex]{0pt}{2ex} \\\hline
+        \begin{lstlisting}[basicstyle=\ttfamily\lst@ifdisplaystyle\footnotesize\fi, language=python]
+
+  (*@\hlorange{form}@*) sympy import *
+  x = Symbol('x', real=True)
+  x, re(x), im(x)
+
+        \end{lstlisting} & \begin{lstlisting}[basicstyle=\ttfamily\lst@ifdisplaystyle\footnotesize\fi, language=python]
+
+  (*@\hlorange{\textbf{from}}@*) sympy import *
+  x = Symbol('x', real=True)
+  x, re(x), im(x)
+
+        \end{lstlisting} \\\hline
+        \begin{lstlisting}[basicstyle=\ttfamily\lst@ifdisplaystyle\footnotesize\fi, language=python]
+
+  result = (*@\hlorange{yeald}@*) From(item.create())
+  raise Return(result)
+
+        \end{lstlisting} & \begin{lstlisting}[basicstyle=\ttfamily\lst@ifdisplaystyle\footnotesize\fi, language=python]
+
+  result = (*@\hlorange{\textbf{yield}}@*) From(item.create())
+  raise Return(result)
+
+        \end{lstlisting} \\\hline
+        \begin{lstlisting}[basicstyle=\ttfamily\lst@ifdisplaystyle\footnotesize\fi, language=python]
+
+  return 1/sum_p if sum_p \
+  (*@\hlorange{\textbf{return}}@*) 0 (*@\hlred{\textbf{else}}@*)
+
+        \end{lstlisting} & \begin{lstlisting}[basicstyle=\ttfamily\lst@ifdisplaystyle\footnotesize\fi, language=python]
+
+  return 1/sum_p if sum_p \
+  (*@\hlorange{\textbf{else}}@*) 0
+
+        \end{lstlisting} \\\hline
+        \begin{lstlisting}[basicstyle=\ttfamily\lst@ifdisplaystyle\footnotesize\fi, language=python]
+
+  sum(len(v) for v items.values())(*@\hlred{)}@*)
+
+        \end{lstlisting} & \begin{lstlisting}[basicstyle=\ttfamily\lst@ifdisplaystyle\footnotesize\fi, language=python]
+
+  sum(len(v) for v (*@\hlgreen{\textbf{in}}@*) items.values())
+
+        \end{lstlisting} \\\hline
+        \begin{lstlisting}[basicstyle=\ttfamily\lst@ifdisplaystyle\footnotesize\fi, language=python]
+
+  df.apply(lambda row: list(set(row['ids'(*@\hlorange{)}@*))))
+
+        \end{lstlisting} & \begin{lstlisting}[basicstyle=\ttfamily\lst@ifdisplaystyle\footnotesize\fi, language=python]
+
+  df.apply(lambda row: list(set(row['ids'(*@\hlorange{]}@*))))
+
+        \end{lstlisting} \\\hline
+        \begin{lstlisting}[basicstyle=\ttfamily\lst@ifdisplaystyle\footnotesize\fi, language=python]
+
+  import numpy (*@\hlorange{ad}@*) np
+  A_concate = np.array([a_0, a_1, a_2,..., a_n])
+
+        \end{lstlisting} & \begin{lstlisting}[basicstyle=\ttfamily\lst@ifdisplaystyle\footnotesize\fi, language=python]
+
+  import numpy (*@\hlorange{\textbf{as}}@*) np
+  A_concate = np.array([a_0, a_1, a_2,..., a_n])
+
+        \end{lstlisting} \\\hline
+        \begin{lstlisting}[basicstyle=\ttfamily\lst@ifdisplaystyle\footnotesize\fi, language=python]
+
+  class MixIn(object)
+    def m():
+      pass
+
+  class classA(MixIn):
+
+  class classB(MixIn):
+
+        \end{lstlisting} & \begin{lstlisting}[basicstyle=\ttfamily\lst@ifdisplaystyle\footnotesize\fi, language=python]
+
+  class MixIn(object)(*@\hlgreen{:}@*)
+    def m():
+      pass
+
+  class classA(MixIn): (*@\hlgreen{\textbf{pass}}@*)
+
+  class classB(MixIn): (*@\hlgreen{\textbf{pass}}@*)
+
+        \end{lstlisting} \\\hline
+      \end{tabular}
+  \end{figure}
+
+  \clearpage\section{Raw data}\label{sec:raw_prec_data}
+
+  Raw data from Precision@k experiments across snippet length and Levenshtein distance from \S~\ref{sec:stackoverflow}. $|\err\sigma|$ indicates the snippet length and $\Delta$ indicates the Levenshtein distance between the broken code and the human fix, computed over lexical tokens.
 
   \begin{table}[!h]
     \centering
     \begin{tabular}{c|c|cccccccc}
       \hline\hline
       & $\Delta$ & \multicolumn{8}{c}{Precision@1} \\ \hline
-      $|\sigma|$ &  & $(0,10)$ & $[10,20)$ & $[20,30)$ & $[30, 40)$ & $[40,50)$ & $[50, 60)$ & $[60,70)$ & $[70, 80)$ \\ \hline
+      $|\err\sigma|$ &  & $(0,10)$ & $[10,20)$ & $[20,30)$ & $[30, 40)$ & $[40,50)$ & $[50, 60)$ & $[60,70)$ & $[70, 80)$ \\ \hline
       Tidyparse
       & 1 & 1.00 & 1.00 & 0.98 & 0.98 & 1.00 & 1.00 & 0.95 & 0.90 \\
       & 2 & 0.51 & 0.36 & 0.24 & 0.26 & 0.24 & 0.23 & 0.12 & 0.10 \\
@@ -1302,5 +1397,6 @@
       & 3 & 0.20 & 0.13 & 0.08 & 0.17 & 0.15 & 0.18 & 0.17 & 0.07 \\ \hline\hline
     \end{tabular}
   \end{table}
+\end{document}
+
 
-\end{document}
diff --git a/src/commonMain/kotlin/ai/hypergraph/kaliningraph/repair/SyntaxRepair.kt b/src/commonMain/kotlin/ai/hypergraph/kaliningraph/repair/SyntaxRepair.kt
@@ -14,6 +14,7 @@ var CFG_THRESH = 20_000
 var MAX_UNIQUE = 20_000 // Maximum number of unique samples to generate
 var MAX_SAMPLE = 20 // Maximum number of repairs to sample
 var MAX_TOKENS = 40 // Maximum number of tokens per repair
+var MIN_TOKENS = 3
 var MAX_RADIUS = 3
 var TIMEOUT_MS = 90_000 // Timeout for each repair attempt (default, modify elsewhere)
 var MAX_REPAIR = 2 // Maximum number of edits per repair