paper/lipics-v2021-bplustrees.tex


\documentclass[a4paper,UKenglish,cleveref, cref, thm-restate]{lipics-v2021}
%This is a template for producing LIPIcs articles. 
%See lipics-v2021-authors-guidelines.pdf for further information.
%for A4 paper format use option "a4paper", for US-letter use option "letterpaper"
%for british hyphenation rules use option "UKenglish", for american hyphenation rules use option "USenglish"
%for section-numbered lemmas etc., use "numberwithinsect"
%for enabling cleveref support, use "cleveref"
%for enabling cref support, use "cref"
%for anonymousing the authors (e.g. for double-blind review), add "anonymous"
%for enabling thm-restate support, use "thm-restate"
%for enabling a two-column layout for the author/affilation part (only applicable for > 6 authors), use "authorcolumns"
%for producing a PDF according the PDF/A standard, add "pdfa"

% listing language definitions for several language of the following kinds:
% * ontology languages
% * markup languages
% * other semantic web languages
% * other languages occurring in the above contexts
% 
% for some related languages see also https://svn.kwarc.info/repos/stex/trunk/sty/etc/lstomdoc.sty
% 
% compiled by Christoph Lange (Universität Bremen, Jacobs University Bremen, University of Birmingham)
% 2010–2013
% math.semantic.web@gmail.com
% 
% https://github.com/clange/latex
% 
% For Mizar, see https://raw.github.com/JUrban/mizarmode/master/lstlangmizar.sty by Josef Urban

% Isabelle
% http://isabelle.in.tum.de
% partial specification; poor man's alternative to Isabelle's own LaTeX export
% but see also pmisabelle.sty
%\newif\iflst@instring\lst@instringfalse
%\newcommand*{\lst@eat}[1]{}%
%\newcommand*{\togglelst@instring}{%
%\upshape%
%\global\lst@instringfalse''
%}
\RequirePackage{xcolor}
% Minimal `listings' language definition for Isabelle sources.
% Two keyword classes are declared; how they are typeset is decided by the
% keywordstyle settings of \lstset, not here.
\lstdefinelanguage{isabelle}{
  % Keyword class 1: theory-level and proof-level commands
  % (theorem, lemma, locale, fun, datatype, proof, qed, ...).
  morekeywords={theorem,theorems,corollary,lemma,lemmas,locale,begin,end,fixes,assumes,shows,
    constrains , definition, where, apply, done,unfolding, primrec, using, by, for, uses,
    schematic_lemma, concrete_definition, prepare_code_thms, export_code, datatype,
    proof, next, qed, show, have, hence, thus, interpretation, fix, context, fun, partial_function
 } ,
  % Keyword class 2: term-level and program-level words (if/then/else, case/of, do, let, ...).
  morekeywords=[2]{rec, return, bind, foreach, if, then, else, and, do, let, in, res, spec, fail, assert, while, case, of},
  % Keyword classes 3 and 4 are currently disabled.
%  morekeywords=[3]{length,mod,insert},
%   morekeywords=[4]{simp,auto,intro,elim,rprems,refine_mono,refine_rcg},
  % Keywords are matched case-sensitively.
  sensitive=True,
  % Comments are delimited by (* and *).
  morecomment=[s]{(\*}{\*)},
}

% Global listings style: Isabelle is the default language, code is set in
% italics and keywords in upright roman (classes 1 and 2 additionally bold).
% Math can be embedded with $...$ (mathescape), and arbitrary LaTeX between
% --" and " (escapeinside).
% \normalfont\rmfamily is the LaTeX2e spelling of the obsolete \rm switch; it
% resets the italic shape inherited from basicstyle in exactly the same way.
\lstset{
    language=isabelle,
    mathescape=true,
    escapeinside={--"}{"},
    basicstyle={\itshape},
    keywordstyle=\normalfont\rmfamily\bfseries,
    keywordstyle=[2]\normalfont\rmfamily\bfseries,
    keywordstyle=[3]\normalfont\rmfamily,
    keywordstyle=[4]\normalfont\rmfamily,
    showstringspaces=false,
    keepspaces=true,
    columns=[c]fullflexible}
% Literate replacements: ASCII and Isabelle \<symbol> notation in listings is
% rendered as the corresponding mathematical symbol.
% Each entry is {source}{{replacement}}width; a replacement consisting of
% more than one token must be wrapped in an extra brace group.
% NOTE(review): \lrec, \rrec, \lsem and \rsem are not defined in this file --
% confirm that a loaded package provides them before using (| |) or [| |].
\lstset{literate=
  {"}{}0
  {'}{{${}^\prime$}}1
  {\%}{{$\lambda$}}1
  {\\\%}{{$\lambda$}}1
  {\\\$}{{$\mathbin{\,\$\,}$}}1
  {->}{{$\rightarrow$}}1
  {<-}{{$\leftarrow$}}1
  {<.}{{$\langle$}}1
  {.>}{{$\rangle$}}1
  {<=}{{$\le$}}1
  {<->}{{$\leftrightarrow$}}1
  {-->}{{$\longrightarrow$}}2
  {<-->}{{$\longleftrightarrow$}}1
  {=>}{{$\Rightarrow$}}1
  {==}{{$\equiv$}}2
  {==>}{{$\implies$}}2
  {<=>}{{$\Leftrightarrow$}}1
  {~=}{{$\ne$}}1
  {!!}{{$\bigwedge$}}1
  {(}{{$($}}1
  {)}{{$)$}}1
  {\{}{{$\{$}}1
  {\}}{{$\}$}}1
  {[}{{$[$}}1
  {]}{{$]$}}1
  {(|}{{$\lrec$}}1
  {|)}{{$\rrec$}}1
  {[|}{{$\lsem$}}1
  {|]}{{$\rsem$}}1
  {|}{{$|$}}1
  {\\<lbrakk>}{{$\lsem$}}1
  {\\<rbrakk>}{{$\rsem$}}1
  {|-}{{$\vdash$}}1
  {|->}{{$\mapsto$}}1
  {|_|}{{$\bigsqcup$}}1
  {...}{{$\dots$}}1
  {\\x}{{$\times$}}1
  {_0}{{${}_0$}}1
  {_1}{{${}_1$}}1
  {_2}{{${}_2$}}1
  {_3}{{${}_3$}}1
  {_4}{{${}_4$}}1
  {_5}{{${}_5$}}1
  {_6}{{${}_6$}}1
  {_7}{{${}_7$}}1
  {_8}{{${}_8$}}1
  {_9}{{${}_9$}}1
  {^*}{{$^*$}}1
  {\\<^sup>*}{{$^*$}}1
  {\\<^sub>*}{{$_*$}}1
  {\\<^sub>A}{{$_A$}}1
  {\\<^sub>r}{{$_r$}}1
  {\\<^sub>a}{{$_a$}}1
  {\\<^sub>t}{{$_t$}}1
  {:_i}{{$:_i$}}1
  {\\<A>}{{$\mathcal{A}$}}1
  {\\<O>}{{\textsf{o}}}1
  {\\<Phi>}{{$\Phi$}}1
  {\\<Psi>}{{$\Psi$}}1
  {\\<sigma>}{{$\sigma$}}1
  {\\<in>}{{$\in$}}1
  {\\<and>}{{$\wedge$}}1
  {\\<le>}{{$\le$}}1
  {\\<noteq>}{{$\ne$}}1
  {\\<lambda>}{{$\lambda$}}1
  {\\<longrightarrow>}{{$\longrightarrow$}}1
  {\\<longleftrightarrow>}{{$\longleftrightarrow$}}1
  {\\<Rightarrow>}{{$\Rightarrow$}}1
  {\\<Longrightarrow>}{{$\Longrightarrow$}}1
  {\\<rightarrow>}{{$\rightarrow$}}1
  {\\<leftarrow>}{{$\leftarrow$}}1
  {\\<mapsto>}{{$\mapsto$}}1
  {\\<equiv>}{{$\equiv$}}1
  {\\<and>}{{$\wedge$}}1
  {\\<or>}{{$\vee$}}1
  {\\<And>}{{$\bigwedge$}}1
  {\\<Up>}{{$\Uparrow$}}1
  {\\<Down>}{{$\Downarrow$}}1
  {\\<up>}{{$\uparrow$}}1
  {\\<down>}{{$\downarrow$}}1
  {\\<times>}{{$\times$}}1
  {\\<forall>}{{$\forall$}}1
  {\\<exists>}{{$\exists$}}1
  {\\<union>}{{$\cup$}}1
  {\\<Union>}{{$\bigcup$}}1
  {\\<inter>}{{$\cap$}}1
  {\\<subset>}{{$\subset$}}1
  {\\<subseteq>}{{$\subseteq$}}1
  {\\<supset>}{{$\supset$}}1
  {\\<supseteq>}{{$\supseteq$}}1
  {\\<alpha>}{{$\alpha$}}1
  {\\<beta>}{{$\beta$}}1
  {\\<gamma>}{{$\gamma$}}1
  {\\alpha}{{$\alpha$}}1
  {\\beta}{{$\beta$}}1
  {\\gamma}{{$\gamma$}}1
  {\\<Gamma>}{{$\Gamma$}}1
  {\\<langle>}{{$\langle$}}1
  {\\<rangle>}{{$\rangle$}}1
  {\\<not>}{{$\neg$}}1
  {\\<notin>}{{$\notin$}}1
  {\\<guillemotright>}{{$\gg$}}1
  {\\in}{{$\in$}}1
  {\\and}{{$\wedge$}}1
  {\\or}{{$\vee$}}1
  {\\Phi}{{$\Phi$}}1
  {\\Psi}{{$\Psi$}}1
  {\\le}{{$\le$}}1
  {\\Up}{{$\Uparrow$}}1
  {\\Down}{{$\Downarrow$}}1
  {>>}{{$\gg$}}1
  {>>=}{{${\gg}{=}$}}1
  {<*lex*>}{{$\times_{\mathsf{lex}}$}}1
}

% \isai: shorthand for an inline Isabelle listing.
\newcommand*{\isai}{\lstinline[language=isabelle]}

% \bplustreeassn: math operator printing the name ``bplustree'' with subscript A.
\DeclareMathOperator{\bplustreeassn}{bplustree_A}

%\pdfoutput=1 %uncomment to ensure pdflatex processing (mandatatory e.g. to submit to arXiv)
%\hideLIPIcs  %uncomment to remove references to LIPIcs series (logo, DOI, ...), e.g. when preparing a pre-final version to be uploaded to arXiv or another public repository

\graphicspath{{./figures/}}%helpful if your graphic files are in another directory

\bibliographystyle{plainurl}% the mandatory bibstyle

\title{A Verified Implementation of B$^+$-trees in Isabelle/HOL}

%\titlerunning{Dummy short title} %TODO optional, please use if title is longer than one line

\author{Niels Mündler}{Department of Computer Science, ETH Zurich, Switzerland}{nmuendler@ethz.ch}{https://orcid.org/0000-0003-3851-2557}{}%TODO mandatory, please use full name; only 1 author per \author macro; first two parameters are mandatory, other parameters can be empty. Please provide at least the name of the affiliation and the country. The full address is optional. Use additional curly braces to indicate the correct name splitting when the last name consists of multiple name parts.

\author{Tobias Nipkow}{Department of Informatics, Technical University of Munich, Germany}{nipkow@in.tum.de}{https://orcid.org/0000-0003-0730-515X}{}

%\author{Peter Lammich}{Department of Computer Science, The University of Manchester, Great-Britain}{lammich@in.tum.de}{https://orcid.org/0000-0003-3576-0504}{}

\authorrunning{N. Mündler and T. Nipkow} %TODO mandatory. First: Use abbreviated first/middle names. Second (only in severe cases): Use first author plus 'et al.'

\Copyright{Niels Mündler} %TODO mandatory, please use full first names. LIPIcs license is "CC-BY";  http://creativecommons.org/licenses/by/3.0/

\begin{CCSXML}
    <ccs2012>
       <concept>
           <concept_id>10003752.10003790.10011742</concept_id>
           <concept_desc>Theory of computation~Separation logic</concept_desc>
           <concept_significance>500</concept_significance>
           </concept>
     </ccs2012>
\end{CCSXML}
    
\ccsdesc[500]{Theory of computation~Separation logic}
%TODO mandatory: Please choose ACM 2012 classifications from https://dl.acm.org/ccs/ccs_flat.cfm 

\keywords{Separation Logic, Verification, Refinement} %TODO mandatory; please add comma-separated list of keywords

\category{} %optional, e.g. invited paper

\relatedversion{} %optional, e.g. full version hosted on arXiv, HAL, or other respository/website
%\relatedversiondetails[linktext={opt. text shown instead of the URL}, cite=DBLP:books/mk/GrayR93]{Classification (e.g. Full Version, Extended Version, Previous Version}{URL to related version} %linktext and cite are optional

%\supplement{}%optional, e.g. related research data, source code, ... hosted on a repository like zenodo, figshare, GitHub, ...
%\supplementdetails[linktext={opt. text shown instead of the URL}, cite=DBLP:books/mk/GrayR93, subcategory={Description, Subcategory}, swhid={Software Heritage Identifier}]{General Classification (e.g. Software, Dataset, Model, ...)}{URL to related version} %linktext, cite, and subcategory are optional

%\funding{(Optional) general funding statement \dots}%optional, to capture a funding statement, which applies to all authors. Please enter author specific funding statements as fifth argument of the \author macro.

\acknowledgements{I want to thank Peter Lammich for giving me a very comprehensive introduction
to separation logic and discussing the many ways to express assertions.
I further want to thank Manuel Eberl for his immediate and impressive support 
with Isabelle proof tactics.}%optional

%\nolinenumbers %uncomment to disable line numbering


%Editor-only macros:: begin (do not touch as author)%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\EventEditors{Tobias Nipkow, Larry Paulson and Makarius Wenzel}
\EventNoEds{3}
\EventLongTitle{Isabelle Workshop 2022}
\EventShortTitle{IWS 2022}
\EventAcronym{IWS}
\EventYear{2022}
\EventDate{August 11, 2022}
\EventLocation{Haifa, Israel}
\EventLogo{}
\SeriesVolume{}
\ArticleNo{2}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Singular and plural spelling of the data structure's name. The macros take
% no argument, so follow them with `\ ' when a space is needed.
\newcommand*{\btree}{B$^+$-tree}
\newcommand*{\btrees}{B$^+$-trees}

\begin{document}

\maketitle

%TODO mandatory: add short abstract of the document
\begin{abstract}
    In this paper we present the verification of an imperative
    implementation of the ubiquitous \btree\ data structure in the
    interactive theorem prover Isabelle/HOL. The implementation supports
    membership test, insertion and range queries with efficient binary
    search for intra-node navigation. The imperative implementation is
    verified in two steps: an abstract set interface is refined to an
    executable but inefficient purely functional implementation which is further
    refined to the efficient imperative implementation.
\end{abstract}

\section{Introduction}
\label{sec:introduction}

\btrees\ form the basis of virtually all modern relational database management systems (RDBMS)
and file systems.
Even single-threaded databases are non-trivial to analyse and verify,
especially machine-checked.
Meanwhile it is important to verify various properties like functional correctness,
termination and runtime,
since RDBMS are ubiquitous and employed in critical contexts,
like the banking sector and realtime systems.
The only work in the literature on that topic that we are aware of
is the work by Malecha \emph{et al.} \cite{DBLP:conf/popl/MalechaMSW10}.
However, it lacks the commonly used range
query operation, which returns a pointer to the lower bound of a given value
in the tree
and allows iterating over all successive values.
This operation is particularly challenging to verify as it requires
mixing two usually strictly separated abstractions of the tree
in order to reason about its correctness.
We further generalize the implementation of node internal
navigation.
This allows us to abstract away from its implementation
and simplifies proofs.
It further allows us to supply an implementation of
efficient binary search, a practical and widespread runtime improvement
as nodes usually have a size of several kilobytes.
We provide a computer assisted proof in the interactive
theorem prover Isabelle/HOL \cite{DBLP:books/sp/NipkowK14} for the functional
correctness of an imperative implementation of the \btree\ data-structure
and present how we dealt with the resulting technical verification challenges.


\section{Contributions}

In this work, we specify the \btree\ data structure in the
functional modeling language higher-order logic (HOL).
The tree is proven to refine a finite set of linearly ordered elements.
All proofs are machine-checked in the Isabelle/HOL framework.
Within the framework,
the functional specification already yields automatic extraction of executable,
but inefficient code.

The contributions of this work are as follows:
\begin{itemize}
   \item The first verification of genuine range queries,
         which require additional insight in refinement over iterating over the whole tree.
   \item The first efficient intra-node navigation based on binary rather than linear search.
\end{itemize}

The remainder of the paper is structured as follows.
In \cref{sec:related_work,sec:approach}, we present a brief overview of related
work and introduce the definition of \btree\ used in our approach.
In \cref{sec:set,sec:range},
we refine a functionally correct, abstract specification of
point, insertion and range queries as well as iterators
down to efficient imperative code.
Finally, we present learned lessons and evaluate the results
in \cref{sec:conclusion}.

The complete source code of the implementation referenced in this research
is accessible via GitHub\footnote{\url{https://github.com/nielstron/bplustrees}}.

\subsection{Related Work}
\label{sec:related_work}

There exist two pen and paper proofs via a rigorous formal approach.
Fielding \cite{Fielding80} uses gradual refinement of abstract
implementations.
Sexton and Thielecke \cite{DBLP:journals/entcs/SextonT08} show how to use 
separation logic in the verification.
These are more of a conceptual guideline on approaching a fully machine checked proof.

There are two machine checked proofs of imperative implementations.
In the work of Ernst \emph{et al.} \cite{DBLP:journals/sosym/ErnstSR15},
an imperative implementation is directly verified
by combining interactive theorem proving in KIV \cite{ReifKIV}
with shape analysis using TVLA \cite{DBLP:journals/toplas/SagivRW02}.
The implementation lacks shared pointers between leaves.
This simplifies the proofs about tree invariants.
However, the tree therefore also lacks iterators over the leaves,
and the authors present no straightforward solution to implement them.
Moreover, by directly verifying an imperative version only,
it is likely that small changes in the implementation will
break larger parts of the proof.

Another direct proof on an imperative implementation
was conducted by Malecha~\emph{et~al.}~\cite{DBLP:conf/popl/MalechaMSW10}, with the Ynot
extension to the interactive theorem prover Coq.
Both works use recursively defined shape predicates
that describe formally how the nodes and pointers
represent an abstract tree of finite height.
The result is both a fairly abstract specification of a \btree,
that leaves some design decisions to the imperative implementation,
and an imperative implementation that supports
iterators.

Due to the success of this approach,
we follow their example and define these predicates functionally.
One example of the benefits of this approach is that we were able
to derive finiteness and acyclicity only from the
relation between imperative and functional specification.
In contrast to previous work, the functional predicates describing the tree shape are kept
completely separated from the imperative implementation,
yielding more freedom for design choices within the imperative refinement.
Both existing works rely on linear search for intra-node navigation,
which we improve upon by providing binary search.
We extend the extraction of an iterator
by implementing an additional range query operation. 


\section{\btrees\ and Approach}
\label{sec:approach}


The \btree\ is a ubiquitous data structure to efficiently retrieve and manipulate
indexed data stored on storage devices with slow memory access \cite{DBLP:journals/csur/Comer79}.
\btrees\ are $k$-ary balanced search trees, where $k$ is a free parameter.
We specify them as implementing a set interface,
where all elements in the leaves comprise the content of an abstract set.
The inner nodes only contain separators instead of the set content.
These separators have the same type as the set content,
but are only used to guide the recursive navigation through the tree
by bounding all set values in the neighboring subtrees.
Further the leaves usually contain pointers
to the next leaf, allowing for efficient iterators and range queries.
A more formal and detailed outline of \btrees\ can be found in \cref{sec:data_structure_defs}.

The goal of this work is to define this data structure
and implement and verify efficient heap-based imperative operations on them.
For this purpose, we introduce a functional, algebraic definition and
specify all invariants on this level that can naturally be expressed in the algebraic domain.
It is important to note that this representation is not complete,
as aliased pointers are left out on the algebraic level.
However, important structural invariants, such as sortedness and balancedness
can be verified.

In a second step an imperative definition is introduced,
that takes care of the refinement of lists to arrays in the heap
and introduces (potentially shared) pointers instead of algebraic structures.
Using a refinement relationship, we can prove that an imperative refinement
of the functional specification preserves the structural invariants
of the imperative tree on the heap.
The only remaining proof obligation on this level is to ensure the correct linking
between leaf pointers.

The above outlined steps are performed via manual refinement in Imperative HOL \cite{DBLP:conf/tphol/BulwahnKHEM08}.
We build on the library of verified imperative utilities
provided by the Separation Logic Framework \cite{DBLP:journals/afp/LammichM12}
and the verification of B-Trees \cite{DBLP:journals/afp/Mundler21},
namely list interfaces and partially filled arrays.
The implementation is defined with respect to an abstract imperative
operation for node-internal navigation.
This means that within each node, we do not specify
how the correct subtree for recursive queries is found,
but only constrain some characteristics of the result.
We provide one such operation that employs linear search,
and one that conducts binary search.
All imperative programs are shown to refine the functional specifications
using the separation logic utilities from the Isabelle Refinement Framework by
Lammich \cite{DBLP:journals/jar/Lammich19}.

\subsection{Notation}

Isabelle/HOL conforms to everyday mathematical notation for the most part.
For the benefit of the reader who is unfamiliar with Isabelle/HOL, we establish
notation and in particular some essential datatypes together with their primitive
operations that are specific to Isabelle/HOL. We write \textit{t :: 'a} to specify that
the term \textit{t} has the type \textit{'a} and \textit{'a $\Rightarrow$ 'b}
for the type of a total function from \textit{'a} to \textit{'b}.
The type for natural numbers is \textit{nat}.
Sets with elements of type \textit{'a} have the type \textit{'a set}.
Analogously, we use \textit{'a list} to describe lists, which are constructed as the empty
list \textit{[]} or with the infix constructor \textit{\#}, and are appended with the infix operator
\textit{@}. The function \textit{concat} concatenates a list of lists.
The function \textit{set} converts a list into a set. For optional values, Isabelle/HOL
offers the type \textit{option} where a term \textit{opt :: 'a option} is either \textit{None} or \textit{Some a}
with \textit{a :: 'a}.

\subsection{Definitions}
\label{sec:data_structure_defs}

% TODO shorten

We first define an algebraic version of \btrees.
Proofs about the correctness of operations and the preservation of invariants
are only done on the abstract level, where they are much simpler
and many implementation details can be disregarded.
It will serve as a reference point for the efficient
imperative implementation.

The algebraic \btree\ is defined as follows:

% Algebraic B+-tree: a Leaf carries a list of values; a Node carries a list of
% (subtree, separator) pairs followed by the last subtree.
\begin{lstlisting}[mathescape=true, language=Isabelle,label=lst:btree-def]
datatype 'a bplustree =
    Leaf ('a list) |
    Node (('a bplustree \<times> 'a ) list) ('a bplustree)
\end{lstlisting}


\begin{figure}
    \centering
    \includegraphics[width=0.5\linewidth]{btree-basic-nopair.pdf}
    \caption[Visualization of a \btree]
    {Nodes contain several elements, the internal list/array structure is not depicted.
    The dotted lines represent links to following leaf nodes that are not present in the algebraic formulation.}
    \label{fig:btree-basic}
\end{figure}


Every node \emph{Node} [($t_1$,$a_1$), \dots, ($t_n$,$a_n$)] $t_{n+1}$ contains an interleaved list of \textit{keys} $a_i$ and \textit{subtrees} $t_i$.
We refer to $t_i$ as the subtree to the left of $a_i$ and
to $t_{i+1}$ as the subtree to the right of $a_i$.
We refer to $t_{n+1}$ as the \textit{last} subtree.
The leaves \emph{Leaf} [$v_1$, \dots, $v_n$] contain a list of \textit{values} $v_i$.
Separators are only used for navigation within the tree.
The concatenation of lists of values of a tree $t$ yields
all elements contained in the tree. We refer to this list as \emph{leaves t}.
A \btree\ with above structure must fulfill the invariants
\textit{balancedness}, \textit{order} and \textit{alignment}.

\textit{Balancedness} requires
that each path from the root to a leaf has the same length.
In other words, the height of all trees in one level of the tree must be equal,
where the height is the maximum path length to a leaf.

The \textit{order} property ensures a minimum and maximum
number of subtrees for each node.
A \btree\ is of order $k$, if each internal node has at least $k+1$
subtrees and at most $2k+1$.
The root is required to have a minimum of 2 and a maximum of $2k+1$ subtrees.
We require that $k$ be strictly positive, as for $k = 0$ the requirements on the tree
root are contradictory.

\textit{Alignment} means that keys are sorted with respect to separators:
For a separator $k$, all keys $l$ in the subtree to the left satisfy $l \leq k$,
and all keys $r$ in the subtree to the right satisfy $k < r$
(where $\leq$ and $<$ can be exchanged).
Specifically we require for a tree $t$ that \emph{Laligned t $\top$},
where \emph{Laligned} is defined as in \cref{fig:btree-alignment-def}
and $\top$ is the top element of the linear order. 

\begin{figure}
% Alignment predicates. `inbetween f l ts t u' threads the bounds through the
% (subtree, separator) list; `aligned l t u' bounds all values of t by
% l < x <= u; `Laligned t u' only imposes the upper bound on the leftmost path.
% The leaf constructor is spelled `Leaf', as in the datatype definition.
\begin{lstlisting}[mathescape=true, language=Isabelle,label=lst:btree-alignment-def]
fun inbetween where
  inbetween f l [] t u = f l t u |
  inbetween f l ((sub,sep)#xs) t u = (f l sub sep $\wedge$ inbetween f sep xs t u)

fun aligned where
  aligned l (Leaf ks) u = (l < u $\wedge$ ($\forall$x $\in$ set ks. l < x $\wedge$ x $\le$ u)) |
  aligned l (Node ts t) u = (inbetween aligned l ts t u)

fun Laligned where
  Laligned (Leaf ks) u = ($\forall$x $\in$ set ks. x $\le$ u) |
  Laligned (Node ts t) u = (case ts of
      [] $\Rightarrow$ Laligned t u |
      (sub,sep)#ts' $\Rightarrow$ (Laligned sub sep) $\wedge$ inbetween aligned sep ts' t u
    )
\end{lstlisting}
\caption{Definition of the alignment property.}
\label{fig:btree-alignment-def}
\end{figure}


For the values within the leaves, \textit{sortedness} is required explicitly.
We require the even stronger fact that \emph{leaves t} is sorted.
This is a useful statement when arguing about the correctness of set operations.

All of these mentioned invariants are proven to be maintained by the abstract set operations.
While these abstract operations already yield executable code,
they are not translated into particularly efficient code.

The efficient implementation of \btrees\ is defined
on the imperative level.
Each imperative node contains pointers (\emph{ref}) rather than the full subtree.
We refine lists with partially filled arrays of capacity $2k$.
A partially filled array $(a,i)$ with capacity $c$ is an array $a$ of fixed size $c$.
Only the first $i$ elements are considered content of the array.
Unlike dynamic arrays, partially filled arrays are not expected to grow or shrink.
This way, the data structures are refined to an imperative level
such that each imperative node contains information equivalent to that of an abstract node.
The only addition is that leaves now also contain a pointer to another leaf,
which will form a linked list over all leaves in the tree.

% Imperative node type: a Btnode stores a partially filled array of
% (optional child reference, separator) pairs and the reference to its last
% child; a Btleaf stores a partially filled array of values and an optional
% reference to another leaf.
\begin{lstlisting}[mathescape=true, language=Isabelle,label=lst:btree-imp-def]
datatype 'a btnode =
  Btnode (('a btnode ref option \<times> 'a) pfarray) ('a btnode ref) |
  Btleaf ('a pfarray) ('a btnode ref option)
\end{lstlisting}

It is possible to modify elements on the heap and share pointers with this setup.
In order to use the algebraic data structure as a reference point,
we introduce a refinement relation.
The correctness of operations on the imperative node
can then be shown by relating imperative input and output
to the abstract input and output of a correct abstract operation.
In particular we want to show that if we assume \emph{R t $t_i$},
where $R$ is the refinement relation and $t$ and $t_i$ are the abstract
and the imperative version of the ``same'' tree,
\emph{R o(t) $o_i$($t_i$)} should hold, where $o_i$ is the imperative refinement
of operation $o$.
The relation is expressed as a separation logic formula that links an abstract tree to its
imperative equivalent.

The notation for separation logic in Isabelle is quickly summarized in the list below.
\begin{itemize}
    \item \textit{emp} holds for the empty heap
    \item \textit{true} and \textit{false} hold for every and no heap respectively
    \item $\uparrow(P)$ holds if the heap is empty and predicate $P$ holds
    \item $a \mapsto_r x$ holds if the heap at location $a$ is reserved and contains
    value $x$
    \item $\exists_A x.\ P\ x$ holds if there exists some $x$ such that $P\ x$
    holds on the heap.
    \item $P_1 * P_2$ denotes the separating conjunction and holds if each assertion $P_1$ and $P_2$ hold on non-overlapping parts
    of the heap
    \item \emph{is\_pfa c xs xsi} expresses that $xsi$ is a partially filled array
    with capacity $c$ that refines the list $xs$.
    \item \emph{list\_assn P xs ys} expresses that \emph{P xs[i] ys[i]} holds for all $i < |xs| = |ys|$.
\end{itemize}
% TODO introduce notation of sep logic
Separation Logic formulae always express the state of some heap.
The assertion $P$ describes all heaps for which the formula $P$ evaluates to true.
The entailment $P \Longrightarrow_A Q$ holds iff $Q$ holds in every heap in which $P$ holds.
$P = Q$ holds iff $P \Longrightarrow_A Q \wedge Q \Longrightarrow_A P$.
The formulas are usually used in the context of Hoare triples.
We write $\langle P \rangle\ c\ \langle \lambda r.\ Q\ r \rangle$ if, for any heap where $P$ holds, after executing
imperative code $c$ that returns value $r$, formula $Q\ r$ holds on the resulting heap.
$\langle P \rangle\ c\ \langle \lambda r.\ Q\ r \rangle_t$ is a shorthand for $\langle P \rangle\ c\ \langle \lambda r.\ Q\ r * \mathit{true} \rangle$.
More details can be found in the work of Lammich and Meis \cite{DBLP:journals/afp/LammichM12}.

The assertion \emph{bplustree\_assn} expressing the refinement relation
relates an algebraic tree (\emph{bplustree})
and an imperative tree (\emph{btnode ref}), as well as the first leaf of the imperative tree
and the first leaf of its next sibling.
The formal relation is shown in \cref{fig:btree-assn}.

\begin{figure}
   \centering 
% Refinement assertion between an abstract tree and its heap representation.
% Arguments: split factor k, abstract tree, root pointer a, first-leaf pointer
% r and the pointer z that the forward pointer of the last leaf must equal.
% r and z are optional references: the leaf case compares r with `Some a' and
% z with the leaf's forward pointer, which is a `btnode ref option'.
\begin{lstlisting}[mathescape=true, language=Isabelle,label=lst:btree-relation]
fun bplustree_assn :: nat $\Rightarrow$ 'a bplustree $\Rightarrow$ 'a btnode ref
    $\Rightarrow$ 'a btnode ref option $\Rightarrow$ 'a btnode ref option $\Rightarrow$ assn
    where
  bplustree_assn k (Leaf xs) a r z =
  $\exists_A$ xsi fwd.
      a $\mapsto_r$ Btleaf xsi fwd
    * is_pfa (2*k) xs xsi
    * $\uparrow$(fwd = z)
    * $\uparrow$(r = Some a)
    |
  bplustree_assn k (Node ts t) a r z =
  $\exists_A$ tsi ti tsi' rs.
      a $\mapsto_r$ Btnode tsi ti
    * is_pfa (2*k) tsi' tsi
    * $\uparrow$(length tsi' = length rs)
    * list_assn (($\lambda$ t (ti,r',z'). bplustree_assn k t (the ti) r' z') $\times_a$ id_assn) ts (
        zip (zip (map fst tsi') (zip (butlast (r#rs)) rs)) (map snd tsi'))
    * bplustree_assn k t ti (last (r#rs)) z
\end{lstlisting}
    \caption[Assertion describing the imperative \btree]{
        The \btree\ is specified by the split factor $k$, an abstract tree,
        a pointer to its root, a pointer to its first leaf and a pointer
        to the first leaf of the next sibling.
        The pointers to first leaf and next first leaf are used
        to establish the linked leaves invariant.
    }
    \label{fig:btree-assn}
\end{figure}

The main structural relationship between abstract and imperative tree
is established by linking abstract list and array via the \textit{is\_pfa} predicate,
and recursively linking the abstract subtrees and subtree pointers
inside the \textit{list\_assn}.

In addition to the refinement relation,
the pointers \textit{r} and \textit{z} to the first leaf of the tree and to the first leaf of its right neighbor
are used to express the structural invariant
that the leaves are correctly linked.
This property is required for the iterator on the tree in \cref{sec:imperative_iter}.
The structural invariant is ensured
by passing the first leaf of the right neighbor to each subtree.
We cannot explicitly retrieve these leaves from the tree structure.
The reason is that any functions that follow the pointers of the tree are not guaranteed
to terminate without the context of the structural soundness of the tree,
which is only established within the refinement relation.
Instead, we assume that there exists a list of such leaf pointers \textit{rs}.
We ensure that this list is the correct one by passing the supposedly
first leaves into each subtree.
The pointer is passed recursively to the leaf node,
where it is compared to the actual pointer of the leaf.
All of this happens in the convoluted \textit{list\_assn}, by
folding over the leaf pointer list \textit{rs} zipped with itself, offset by one.

There is no abstract equivalent for the next pointers in the leaves,
therefore we can only introduce and reason about this invariant on the imperative layer.
Due to the constraints of separation logic, we cannot express this invariant
in a separate statement from the refinement relation.
We need to access the elements in each node to ensure the refinement relation,
and in this step we also access the memory that contains the next pointers.
Since separation logic only permits us to access the memory location
exclusively in each term separated by the separating conjunction, this single access 
must cover all invariants.


\subsection{Node internal navigation}
\label{sec:split}

In order to define meaningful operations that navigate
the node structure of the \btree,
we need to find a method that handles search within a node.
% For general $k$-ary \btrees\ there have not been sophisticated search strategies.
Ernst \emph{et al.} \cite{DBLP:journals/sosym/ErnstSR15} and Malecha~\emph{et~al.}~\cite{DBLP:conf/popl/MalechaMSW10}
both use a linear search through the key and value lists.
However, \btrees\ are supposed to have memory page sized nodes \cite{DBLP:journals/csur/Comer79}, 
which makes a linear search impractical.

We introduce a context (\emph{locale} in Isabelle) in which we assume that we
have access to a function that correctly navigates through the node internal structure.
We call this function \emph{split}, and define it only by its behavior.
Given a list of separator-subtree pairs and a search value $x$, the function should return the pair $(s,t)$ such that,
according to the structural invariant of the \btree, $t$ must contain $x$ or will hold $x$ after a correct insertion.
A corresponding function \emph{split\_list} is defined on the separator-only lists in the leaf nodes.
The formal specification for \emph{split} is given in \cref{fig:split-def}.

\begin{figure}
   \centering 
% Locale fixing the abstract node-internal navigation function `split'.
% It returns a pair of lists (ls, rs); the assumptions state that the two
% lists concatenate to the input and that, for sorted separators, the search
% value p lies strictly above the last separator of ls and at or below the
% first separator of rs.
\begin{lstlisting}[mathescape=true, language=Isabelle,label=lst:split-def]
locale split_tree =
    fixes split ::  "('a bplustree \<times> 'a) list \<Rightarrow> 'a
        \<Rightarrow> (('a bplustree \<times> 'a) list \<times> ('a bplustree \<times> 'a) list)"
    assumes
    "split xs p = (ls,rs) \<Longrightarrow> xs = ls @ rs"
    "split xs p = (ls@[(sub,sep)],rs); sorted_less (separators xs) \<Longrightarrow> sep < p"
    "split xs p = (ls,(sub,sep)#rs); sorted_less (separators xs) \<Longrightarrow> p \<le> sep"
\end{lstlisting}
    \caption[Definition of \emph{split}]{
        Given a list of separator-subtree pairs and a search value $x$, the function should return the pair $(s,t)$ such that,
        according to the structural invariant of the \btree, $t$ must contain $x$ or will hold $x$ after a correct insertion.
    }
    \label{fig:split-def}
\end{figure}

In the following sections, all operations are defined and verified
based on \emph{split} and \emph{split\_list}.
Finally, when approaching imperative code extraction,
we provide a binary-search-based function that refines \emph{split}.
This binary search is directly implemented and verified on the imperative
level and is eventually plugged into the abstractly defined
imperative operations on the \btree.
Thus we obtain imperative code that makes use of an efficient
binary search, without adding complexity to the proofs.
The definition and implementation closely follow
the approach described in detail in the
verification of B-Trees \cite{DBLP:journals/afp/Mundler21}.
% TODO copy from BSc. Thesis?


\section{Set operations}
\label{sec:set}

\btrees\ refine sets on linearly ordered elements.
For a tree $t$, the refined abstract set is computed as \emph{set (leaves t)}.
The set interface requires that there should be query, insertion and deletion
operations $o_t$ such that \emph{set (leaves ($o_t$ t)) = o (set (leaves t))}.
Moreover, the invariants described in \cref{sec:approach}
can be assumed to hold for $t$ and are required for $o_t$.
We provide these operations and show their correctness on the functional
layer first, then refine the operations further to the imperative
layer.
For point queries and insertion, we follow the implementation
suggested by Bayer and McCreight \cite{DBLP:journals/acta/BayerM72}.

\subsection{Functional Point Query}
\label{sec:functional_pq}

For an inner node $t$ and a searched value $x$, find the correct subtree $s_t$
such that if a leaf of $t$ contains $x$, a leaf of $s_t$ must contain $x$.
Then recurse on $s_t$.
Inside the leaf node, we search directly in the list of values.
Note that we assume here that a \textit{split} and \textit{isin\_list} operation exist,
as described in \cref{sec:split}.

\begin{lstlisting}[mathescape=true, language=Isabelle,label=lst:isin-def]
fun isin:: "'a bplustree $\Rightarrow$ 'a $\Rightarrow$ bool" where
  isin (LNode ks) x = (isin_list x ks) |
  isin (Node ts t) x = (case split ts x of
     (_,(sub,sep)#rs) $\Rightarrow$ isin sub x
   | (_,[]) $\Rightarrow$ isin t x
  )
\end{lstlisting}

Since this function does not modify the tree involved at all,
we only need to show that it returns the correct value.

\begin{lstlisting}[mathescape=true, language=Isabelle,label=lst:isin-set-inorder]
theorem assumes "sorted_less (leaves t)" and "aligned l t u" 
  shows "isin t x = (x $\in$ set (leaves t))"
\end{lstlisting}

In general, these proofs on the abstract level are 
based on yet another refinement relation suggested by Nipkow~\cite{DBLP:conf/itp/Nipkow16}. 
We say that the \btree\ $t$ refines a sorted list of its leaf values, \emph{leaves t}.
We argue that recursing into a specific subtree
is equivalent to splitting this list at the correct position
and searching in the correct sublist.
The same approach was applicable for proving the correctness of functional
operations on B-Trees \cite{DBLP:journals/afp/Mundler21}.

The proofs on the functional level can therefore be made concise.
We go on and define an imperative version of the operation that
refines each step of the abstract operation to equivalent operations on the imperative tree.

\subsection{Imperative Point Query}
\label{sec:imperative_pq}

The imperative version of the point query is a partial function.
Termination cannot be guaranteed anymore,
at least without further assumptions.
This is inevitable since the function would not terminate
given cyclic trees.
However, we will show that if the input refines an abstract tree,
the function terminates and is correct.
The imperative \emph{isin} refines each step of the abstract
operation with an imperative equivalent.
The result can be seen in \cref{fig:isin-imp-def}.

\begin{figure}
    \centering
\begin{lstlisting}[mathescape=true, language=Isabelle,label=lst:isin-imp-def]
partial_function (heap) isin :: "'a btnode ref $\Rightarrow$ 'a $\Rightarrow$  bool Heap" where
  isin p x = do {
  node $\leftarrow$ !p;
  (case node of
     Btleaf xs _ $\Rightarrow$ imp_isin_list x xs |
     Btnode ts t $\Rightarrow$ do {
       i $\leftarrow$ imp_split ts x;
       tsl $\leftarrow$ length ts;
       if i < tsl then do {
         s $\leftarrow$ get ts i;
         let (sub,sep) = s in
           isin (the sub) x
       } else
           isin t x
    }
)}
\end{lstlisting}
\caption[Definition of imperative \emph{isin}]{
    The imperative definition of the \emph{isin} function.
}
\label{fig:isin-imp-def}
\end{figure}

Again, we assume that \emph{imp\_split} performs the correct node internal search
and refines an abstract \emph{split}.
Note how \emph{imp\_split} does not actually split
the internal array, but rather returns the index of the pair
that would have been returned by the abstract split function.
The pattern matching against an empty list
is replaced by comparing the index to the length of the list $l$.
In case the last subtree should be recursed into, the whole list $l$ is returned.

In order to show that the function returns the correct result,
we show that it performs the same operation on the imperative tree
as on the algebraic tree.
This is expressed in Hoare triple notation and separation logic.

\begin{lstlisting}[mathescape=true, language=Isabelle,label=lst:isin-refines]
lemma assumes "k > 0" and "root_order k t" and "sorted_less (inorder t)"
   and "sorted_less (leaves t)" shows
   "<bplustree_assn k t ti r z>
     isin ti x
   <$\lambda$y. bplustree_assn k t ti r z * $\uparrow$(isin t x = y)>$_t$"
\end{lstlisting}

The proof follows inductively on the structure of the abstract tree.
Assuming structural soundness of the abstract tree refined by the pointer passed in,
the returned value is equivalent to the return value of the abstract function.
We must explicitly show that the tree on the heap
still refines the same abstract tree after the operation,
which was implicit on the abstract layer.
It follows directly, since no operation in the imperative
function modifies part of the tree.

\subsection{Insertion and Deletion}
\label{sec:insert_delete}

The insertion operation and its proof of correctness largely line up with the one for point queries.
But since insertion modifies the tree,
we need to additionally show on the abstract level that the modified tree
maintains the invariants of \btrees.

On the imperative layer, we show that the heap state
after the operation refines the tree
after the abstract insertion operation.
It follows that the imperative operation
also maintains the abstract invariants.
Moreover, we need to show that the linked list 
among the leaf pointers is correctly maintained throughout the operation.
This can only be shown on the imperative level as there is no abstract equivalent
to the shared pointers.

\begin{lstlisting}[mathescape=true, language=Isabelle,label=lst:insert-refines]
lemma assumes "k > 0" and "sorted_less (inorder t)"
    and "sorted_less (leaves t)" and "root_order k t" shows
    "<bplustree_assn k t ti r z>
    imp_insert k x ti
    <\<lambda>u. bplustree_assn k (insert k x t) u r z>$_t$"
\end{lstlisting}
  

We provide a verified functional definition of deletion and a definition of an imperative refinement.
Showing the correctness of the imperative version would largely follow
the same pattern as the proof of the correctness of insertion.
The focus of this work is not on basic tree operations
however, but on obtaining an iterator view on the tree.


\section{Range operations}
\label{sec:range}

This section introduces both how the general iterator
on the tree leaves is obtained and the technical challenges involved
(\cref{sec:imperative_iter})
as well as how to obtain an iterator on a specific
subset of elements efficiently (\cref{sec:imperative_range}).

On the functional level, the forwarding leaf pointers in each leaf
are not present, as this would require aliasing.
Therefore, the abstract equivalent of an iterator
is a concatenation of all leaf contents.
%Range iterators then concatenate the correct parts of the abstract tree.
When refining the operations, we will make use of the leaf pointers
to obtain an efficient implementation.

\subsection{Iterators}
\label{sec:imperative_iter}

The implementation of the leaf iterator is straightforward.
We recurse down the tree to obtain the first leaf.
From there we follow leaf
pointers along the fringe of the tree until we reach the final leaf marked by a null next pointer.
However, from an assertion perspective the situation is more intricate.
It is important to find an explicit formulation of the linked list view on the leaf pointers.
Meanwhile, we want to maintain enough information about the remainder of the tree
to be able to state that the complete tree does not change by iterating through the leaves.
We cannot express an assertion about the linked list along the leaves
and the assertion on the whole tree in two independent predicates,
as separation logic forces us to not make statements about the contents of
any memory location twice.
This is an important feature of separation logic,
in order to keep the parts of the heap disjoint and
thus be able to locally reason about the heap state.

For this, we follow the approach of Malecha \emph{et al.} \cite{DBLP:conf/popl/MalechaMSW10} and
try to find an equivalent formulation that separates the whole tree in a
view on its inner nodes and the linked leaf node list.
The central idea to separate the tree is to
express that the linked leaf nodes refine \emph{leaf\_nodes t}
and that the inner nodes refine \emph{trunk t}, as depicted in \cref{fig:btree-view-split}.
These are two independent parts of the heap and therefore
the statements can be separated using the separating conjunction.

Formally, we define an assertion \emph{trunk\_assn} and \emph{leaf\_nodes\_assn}.
The former is the same as \emph{bplustree\_assn} (see \cref{fig:btree-assn}),
except that we remove all assertions about the content of the tree in the \emph{LNode} case.
The latter is defined similarly to a linked list refining a list of abstract tree leaf nodes,
shown in \cref{fig:leaf-nodes-assn}.
The list is refined by a pointer to the head of the list,
which refines the head of the abstract list.
Moreover, the imperative leaf contains a pointer to the next element in the list.

With these definitions, we can show that the heap describing the imperative tree may be
split up into its leaves and the trunk.

\begin{lstlisting}[mathescape=true, language=Isabelle,label=lst:btree-view-split-oneway]
lemma "bplustree_assn k t ti r z $\Longrightarrow_A$ leaf_nodes_assn k (leaf_nodes t) r z * trunk_assn k t ti r z"
\end{lstlisting}

\begin{figure}
    \centering
\begin{lstlisting}[mathescape=true, language=Isabelle,label=lst:leaf-nodes-assn]
fun leaf_nodes_assn where
  "leaf_nodes_assn k ((LNode xs)#lns) (Some r) z =
 (\<exists>$_A$ xsi fwd.
      r \<mapsto>$_r$ Btleaf xsi fwd
    * is_pfa (2*k) xs xsi
    * leaf_nodes_assn k lns fwd z
  )" |
  "leaf_nodes_assn k [] r z = \<up>(r = z)" |
  "leaf_nodes_assn _ _ _ _ = false"
\end{lstlisting}
\caption[Definition of \emph{leaf\_nodes\_assn}]{
    The refinement relation for leaf nodes comprises the refinement
    of the node content as well as the recursive property of linking correctly to the next node.
}
\label{fig:leaf-nodes-assn}
\end{figure}


However, we cannot show that a structurally consistent, unchanged \btree\
is still described by the combination of the two predicates.
The reason is that we cannot express that the linked leaf nodes
are precisely the leaf nodes on the lowest level of the trunk, depicted
in red in \cref{fig:btree-view-split}.

% TODO graphic explaining the issue
\begin{figure}
    \centering
    \includegraphics[width=1\linewidth]{btree-view-split.pdf}
    \caption[Split view of the \btree]
    {In order to obtain separate assertions about the concatenated leaf list (\emph{leaf\_nodes})
    and the internal nodes (\emph{trunk}) of the tree, the structure is abstractly split along the
    pointers marked in red, the \emph{fringe}. In order to be able to combine the \emph{leaf\_nodes} and the \emph{trunk} together,
    the \emph{fringe} has to be extracted and shared explicitly.}
    \label{fig:btree-view-split}
\end{figure}

The root of this problem is actually a feature of the refinement approach.
When stating that a part of the heap
refines some abstract data structure,
we make few or no statements about concrete memory locations or pointers.
This is useful, as it reduces the size of the specification
and the proof obligations.
In this case it gets in our way.

% vis?

We cannot express that the fringe of the trunk refines the same abstract leaves
that are refined by the leaf list,
as this would violate the disjointness of heaps.
Even if we did, this statement would not be strong enough
to guarantee that the actual memory locations
are the same.
We need to specifically express that these pointers,
and not the abstract structure they refine,
are precisely the same in the two statements.

In a second attempt we succeed by making the sharing explicit.
We extract from the whole tree the precise list of pointers to leaf nodes, the \emph{fringe},
in the correct order.
The fringe is then part of the assertion about the tree.
Recursively, the fringe of a tree is the concatenation of all fringes
in its subtrees.
The resulting assertion can be seen in \cref{fig:btree-assn-leaves}.
As a convenient fact, this assertion is equivalent to \cref{fig:btree-assn}.

\begin{lstlisting}[mathescape=true, language=Isabelle,label=lst:btree-extract-fringe]
lemma bplustree_extract_fringe:
    "bplustree_assn k t ti r z = ($\exists_A$fringe. bplustree_assn_fringe k t ti r z fringe)"
\end{lstlisting}


\begin{figure}
    \centering
\begin{lstlisting}[mathescape=true, language=Isabelle,label=lst:btree-assn-leaves]
fun bplustree_assn_fringe where
    bplustree_assn_fringe k (LNode xs) a r z fringe =
    $\exists_A$ xsi fwd.
        a $\mapsto_r$ Btleaf xsi fwd
      * is_pfa (2*k) xs xsi
      * $\uparrow$(fwd = z)
      * $\uparrow$(r = Some a)
      * $\uparrow$(fringe = [a])
    |
    bplustree_assn_fringe k (Node ts t) a r z fringe =
    $\exists_A$ tsi ti tsi' tsi'' rs split.
        a $\mapsto_r$ Btnode tsi ti
      * bplustree_assn_fringe k t ti (last (r#rs)) (last (rs@[z])) (last split)
      * is_pfa (2*k) tsi' tsi
      * $\uparrow$(concat split = fringe)
      * $\uparrow$(length tsi' = length rs)
      * $\uparrow$(length split = length rs + 1)
      * list_assn (
          ($\lambda$ t (ti,r',z',fring). bplustree_assn_fringe k t (the ti) r' z' fring)
           $\times_a$ id_assn
        ) ts (zip 
            (zip (map fst tsi') (zip (butlast (r#rs)) (zip rs (butlast split))))
            (map snd tsi')
        )
       
\end{lstlisting}
    \caption[\btree\ assertion with extracted fringe]{
        An extended version of the \btree\ assertion from \cref{fig:btree-assn}.
        In order to be able to correctly relate leaf view and internal nodes,
        the shared pointers \emph{fringe} are made explicit, without accessing their memory location.
    }
    \label{fig:btree-assn-leaves}
\end{figure}


Using the \emph{fringe}, we can precisely state an equivalent separated assertion.
We describe the trunk with the assertion \emph{trunk\_assn},
which is the same as \emph{bplustree\_assn\_fringe},
except that the \emph{LNode} case is changed to only $\uparrow(r = \mathit{Some}\ a \wedge \mathit{fringe} = [a])$.
In addition, we extend the definition of \emph{leaf\_nodes\_assn}
to take the \emph{fringe} pointers into account.
We now require that the \emph{fringe} of the trunk is
precisely the list of pointers in the linked list refining \emph{leaf\_nodes}.

\begin{lstlisting}[mathescape=true, language=Isabelle,label=lst:btree-view-split]
lemma bplustree_view_split:
  "bplustree_assn_fringe k t ti r z fringe =
   leaf_nodes_assn k (leaf_nodes t) r z fringe * trunk_assn k t ti r z fringe"
\end{lstlisting}

To obtain an iterator on the leaf nodes of the tree,
we obtain the first leaf of the tree.
By the formulation of the tree assertion, we can express
the obtained result using the assertion about the complete tree.

\begin{lstlisting}[mathescape=true, language=Isabelle,label=lst:btree-first-leaf]
lemma assumes "k > 0" and "root_order k t" shows
  "<bplustree_assn k t ti r z>
  first_leaf ti
  <$\lambda$u. bplustree_assn k t ti r z * $\uparrow$(u = r)>$_t$"
\end{lstlisting}

On the result, we can apply lemmas \hyperref[lst:btree-extract-fringe]{\emph{bplustree\_extract\_fringe}} 
and \hyperref[lst:btree-view-split]{\emph{bplustree\_view\_split}}.
The transformed expression states that
the result of \emph{first\_leaf t} is a pointer to \emph{leaf\_nodes t}.
The tree root \emph{t} remains to refine \emph{trunk t}.

From here, we could define an iterator over the leaf nodes
along the fringe, refining the abstract list \emph{leaf\_nodes}.
However, our final goal is to iterate over the values within each array inside the nodes.
We introduce a flattening iterator for this purpose.
It takes an outer iterator over a data structure \textit{a} that returns elements of type \textit{b},
and an inner iterator over the data structure \textit{b}.
It returns an iterator over the concatenated list of elements.
In this case the inner structure would be the partially filled array
stored in each leaf.
Therefore we need an outer iterator not over the leaves, but over the arrays
contained within.
The exact implementation of this iterator is left out as a technical detail,
and we can find an equivalent formulation of the leaf list and the list of arrays,
which we call \emph{leaves\_assn}.

We define an iterator on this list assertion,
fulfilling the list iterator interface defined by Lammich \cite{DBLP:conf/itp/Lammich19}.
The iterator stores the pointer to the next element to be returned from the list.
The iterator interface requires some functionality.
\begin{itemize}
    \item An \emph{init} function that returns the pointer to the head of the list.
    \item A \emph{has\_next} function that checks whether the current pointer is the null pointer.
    \item A \emph{next} function that returns the array in the current node and its next pointer.
    \item Proofs that we can transform the \emph{leaves\_assn} statement into 
          a leaf iterator statement and vice versa.
\end{itemize}
We provide all of it and show that the linked leaf nodes of the \btree\ form a valid
list of arrays that can be iterated over.
We combine this iterator with the iterator over partially filled arrays
in the flattening iterator and obtain an iterator over all leaf values \emph{leaf\_values\_iter}.

Finally, we want to be able to express that the whole tree does not change throughout the iteration.
For this, we need to keep track of both the leaf nodes assertion and the trunk assertion on \emph{t}.
The assertion describing the iterator therefore contains both.
It also existentially quantifies the fringe, hiding away
the fact that it was extracted in the first place from the client perspective.
Note how all notion of the explicitly shared leaf pointers
has disappeared on this level, as their existence was hidden within the definition
of the tree iterator.


\begin{lstlisting}[mathescape=true, language=Isabelle,label=lst:btree-tree-iter]
definition "bplustree_iter k t ti r vs it = \<exists>$_A$ fringe.
  leaf_values_iter fringe k (leaf_nodes t) (leaves t) r vs it *
  trunk_assn k t ti r None fringe"
\end{lstlisting}

The initializer using the \emph{first\_leaf} operation
defined before now allows us to obtain an iterator over all leaf
values of the tree.
Using the iterator functionalities defined by the flattening operator,
the values can be obtained step by step.
The operations \emph{bplustree\_iter\_next} and \emph{bplustree\_iter\_has\_next}
are exactly the respective operations defined for \emph{leaf\_values\_iter}, renamed.

\begin{lstlisting}[mathescape=true, language=Isabelle]
lemma assumes "k > 0" and "root_order k t" shows
  "<bplustree_assn k t ti r None>
  bplustree_iter_init ti
  <$\lambda$it. bplustree_iter k t ti r (leaves t) it>$_t$"

lemma assumes "vs \<noteq> []" shows
  "<bplustree_iter k t ti r vs it>
  bplustree_iter_next it
  <\<lambda>(a, it'). bplustree_iter k t ti r (tl vs) it' * \<up> (a = hd vs)>\<^sub>t"

lemma 
  "<bplustree_iter k t ti r vs it>
  bplustree_iter_has_next it
  <\<lambda>r'. bplustree_iter k t ti r vs it * \<up> (r' = (vs \<noteq> []))>\<^sub>t"

lemma "bplustree_iter k t ti r vs it \<Longrightarrow>\<^sub>A bplustree_assn k t ti r None * true"
  
\end{lstlisting}


\subsection{Range queries}
\label{sec:imperative_range}

A common use case of \btrees\ 
is to obtain all values within a range \cite{DBLP:journals/ftdb/Graefe11}.
We focus on the range of values in the tree bounded only from below by $x$,
denoted by \emph{lrange t x}.
An iterator over this range can be obtained in logarithmic time.
The operation is similar to the point query operation.
On the leaf level, it returns a pointer to the
reached leaf, that is interpreted as iterator on the list of linked leaves.
The range bounded from below comprises all values returned by the iterator,
the lower bound is its first element.
Due to the lack of links on the abstract layer,
the abstract definition explicitly concatenates all values in the subtrees 
to the right of the reached node.

\begin{lstlisting}[mathescape=true, language=Isabelle,label=lst:btree-lrange]
fun lrange:: "'a bplustree $\Rightarrow$ 'a $\Rightarrow$ 'a list" where
    lrange (LNode ks) x = (lrange_list x ks) |
    lrange (Node ts t) x = (
        case split ts x of (_,(sub,sep)#rs) $\Rightarrow$ (
               lrange sub x @ leaves_list rs @ leaves t
        )
     | (_,[]) $\Rightarrow$ lrange t x
    )
\end{lstlisting}
  
As before, we assume that there exists a function \textit{lrange\_list} that
obtains the \emph{lrange} from a list of sorted values.

The verification of the imperative version turns out to be not as straightforward
as expected, exactly due to this recursive step.
The reason is that iterators can only be expressed on a complete tree,
where the last leaf is explicitly a null pointer.
The issue is a technicality. The \textit{has\_next} function
in the iterator returns whether there are any remaining elements.
We compare the current leaf with the last leaf of the tree.
If the last leaf is a valid leaf node and not a null pointer, and
the linked list supposedly empty, we need to show
that the linked leaf list is not cyclic.
We avoid this proof obligation by requiring that the last leaf is a null pointer.
The linked list of a subtree is however bounded by valid leaves,
precisely the first leaf of the next subtree.

Therefore we introduce an alternative formulation \emph{concat\_leaves\_range} of the
abstract function, similar in thought to how we obtained the iterator
on the list from the first leaf of the tree.
In a first step, we obtain the list of leaf nodes \emph{leaves\_range} (not the contents of them)
based on the recursive search through the tree.
In a second step, we obtain the head of \emph{leaves\_range} and apply \emph{lrange\_list},
to skip over the first values in the first array that are not part of the \emph{lrange}.
The result is concatenated with the tail of \emph{leaves\_range}.

On the imperative layer \emph{leaves\_range}
can be obtained using only the \emph{leaf\_nodes} and \emph{trunk}
assertions.
Only when we have obtained the list of leaves for the whole tree,
we transform the result into an iterator over the leaves.
At this point, the list is terminated by a null pointer
and not the first leaf of the next sibling,
such that we can obtain an iterator with the existing definition.

\begin{lstlisting}[mathescape=true, language=Isabelle,label=lst:btree-leaves-range]
fun leaves_range:: 'a bplustree $\Rightarrow$ 'a $\Rightarrow$ 'a bplustree list where
  leaves_range (LNode ks) x = [LNode ks] |
  leaves_range (Node ts t) x = (
      case split ts x of (_,(sub,sep)#rs) $\Rightarrow$ (
             leaves_range sub x @ leaf_nodes_list rs @ leaf_nodes t
      )
   | (_,[]) $\Rightarrow$ leaves_range t x
  )

fun concat_leaves_range where
  concat_leaves_range t x = (case leaves_range t x of (LNode ks)#list $\Rightarrow$
    lrange_list x ks @ (concat (map leaves list))
  )
\end{lstlisting}

Here, we apply the process of abstract refinement again.
We first formulate \emph{concat\_leaves\_range} on the abstract layer
and verify that it yields the same result as \emph{lrange}.
Then we refine the approach to the imperative layer
and can directly deduce that the approach yields the correct result.

\begin{lstlisting}[mathescape=true, language=Isabelle,label=lst:btree-leaves-range-refines]
lemma assumes "k > 0" and "root_order k t" 
    and "sorted_less (leaves t)" and "Laligned t u" shows 
 "<bplustree_assn k t ti r None>
  imp_concat_leaves_range ti x
  <bplustree_iter k t ti r (lrange t x)>$_t$"
\end{lstlisting}

% TODO vis


\section{Conclusion}
\label{sec:conclusion}

We were able to formally verify an imperative implementation
of the ubiquitous \btree\ data structure.
The implementation features functionality that has not been 
featured in previous implementations,
covering range queries and efficient binary search.

\subsection{Lessons learned}

Handling separation logic formulae has always been
a bit tedious throughout the research.
A major alleviation was the introduction of a specialized tool
that would substitute multiplicative terms in the formula
regardless of the distribution in the original term.
It allows, e.g., the substitution of $a * c = d * e * f$ in
the term $a * b * c$, yielding $d * e * f * b$.
This was particularly useful for incrementally
modifying equivalences of separation logic formulas.

What is currently missing in the implementation of the entailment
solving tool is to eliminate multiplicative terms that
already entail one another.
The entailment $a * b * c \Rightarrow c * e * a$ would
then be processed to the remaining proof obligation $b \Rightarrow e$
instead of the tool stopping without any elimination in case of failure to prove the entailment.


% On a side note, it was interesting to notice that it is rather complicated
% to even express, not to speak of attempting to prove acyclicity
%  along the leaf nodes of the tree, as soon as the linked list was converted into an iterator.
% Fortunately, we found a way around this issue, by only considering
% the complete list of nodes whenever we turned them into
% iterators. In that case, they were terminated by \textit{None}
% which we could guarantee not to occur earlier in the list.

\subsection{Evaluation}


The \btree\ implemented by Ernst \emph{et al.} \cite{DBLP:journals/sosym/ErnstSR15} features point queries and insertion,
however explicitly leaves out pointers within the leaves,
which forbids the implementation of iterators.
Our work is closer in nature to the \btree\ implementation by Malecha \emph{et al.} \cite{DBLP:conf/popl/MalechaMSW10}.
In addition to the functionality dealt with in their work, we extend
the implementation with a missing Range iterator
and supply a binary search within nodes.
Our approach is modular, allowing for the substitution of parts
of the implementation with even more specialized and sophisticated implementations.

Regarding the leaf iterator, we noticed that in the work of Malecha \emph{et al.}
there is no need to extract the fringe explicitly.
The abstract leaves are defined such that they store the precise heap location of the refining node.
In this definition, the precise heap location
is irrelevant in almost every situation and can be omitted,
only its content is relevant to the user.
Only when splitting the tree we obtain the memory location
of nodes explicitly, and then only those locations that are needed to guarantee
that the whole tree is structurally sound.
It is hard to quantify or evaluate which approach
is more elegant in this respect.
From a theoretical view point
we suggest that an approach that is less strict
about the heap state restricts the implementation space less
and leaves more design decisions to the person implementing the specification.


With respect to the effort in lines of code and proof
as depicted in \cref{fig:proof-comparison}, we see
that our approach is similar in effort to the approach by Malecha \emph{et al.}
The numbers do not include the newly defined pure ML proof tactics.
It should be also noted that this includes the statistics
for the additional binary search and range iterator,
that make up around 1000 lines of proof each.
The comparison with Ernst \emph{et al.} is difficult.
Their research completely avoids the usage of leaf pointers,
therefore also omitting iterators completely.
The iterator verification makes up a significant amount of the proof
with at least 1000 lines of proof on its own.
The leaf pointers also affect the verification of point and insertion queries
due to the additional invariant on the imperative level.
We conclude that the Isabelle/HOL framework
provides a feature set
such that verification of \btrees\ is both feasible
and comparable in effort to using Ynot or KIV/TVLA.
The strict separation of a functional and imperative
implementation yields the challenge
of making memory locations explicit where needed.
On the other hand, it permits great freedom
regarding the actual refinement on the imperative level.

\begin{figure}
    \centering
    \begin{tabular}{l|c|c|c}
        \                & \cite{DBLP:conf/popl/MalechaMSW10}$^{+}$ & \cite{DBLP:journals/sosym/ErnstSR15}$^{d}$ & Our approach$^{+}$ \\
        \hline
        Functional code &   360      & -                    & 413  \\ %TODO update
        Imperative code &   510      & 1862                  & 1093  \\
        Proofs          &  5190      & 350 + 510 + 2940\footnotemark[3] & 8663 \\
        Timeframe (months) &  -     & 6+                      & 6\footnotemark[4] + 6   \\
    \end{tabular}
    \caption[Comparison of (unoptimized) Lines of Code and Proof and time investment in related mechanized \btree\ verifications.]
    {Comparison of (unoptimized) Lines of Code and Proof and time investment in related mechanized \btree\ verifications.
    All approaches are comparable in effort, taking into account implementation specifics.
    The marker $^d$ denotes that the implementation verifies deletion operations, whereas $^+$ denotes the implementation of iterators.
    }
    \label{fig:proof-comparison}
\end{figure}
\footnotetext[3]{
    The proof integrates TVLA and KIV, and hence comprises
    explicitly added rules for TVLA (the first number),
    user-invented theorems in KIV (the second number)
    and ``interactions'' with KIV (the third number).
    Interactions are, e.g., choices of an induction variable, quantifier instantiation
    or application of correct lemmas.
    We hence interpret each of them as one apply-style command and hence
    one line of proof.
}
\footnotetext[4]{
    6 months include the preceding work on the verification
    of simple B-Trees.
    As they share much of the functionality with \btrees\ 
    but required their own specifics,
    the time spent on them cannot be accounted for 1:1.
}

% \subsection{Future Work}

% This work extends the canon on verified \btree\ operations.
% A small addition to provide a true
% range query can be provided by wrapping the lower-bounded
% range iterator with an iterator
% that stops iteration when the next element
% would exceed the upper bound.

% Further, this work is still lacking a verified imperative version
% of the deletion function.
% We already provided an abstract specification
% and an imperative refinement and have seen
% that all araising issues are along the lines
% of the proof obligations in the refinement
% proofs of point queries and insertion.

% One important step towards efficient
% \btree\ implementations that can be deployed
% in realistic scenarios, is the migration of this code
% to Isabelle-LLVM. \cite{DBLP:conf/itp/Lammich19}
% At the beginning of this work, the code generator did
% not yet support recursive data structures, but since
% this functionality was added recently \btrees\ would be an interesting application.
% Expectedly, the main difference will be the usage of Bitwords
% instead of unbound natural numbers.

% Finally, \btrees\ are supposed to write the contained
% memory out to disk in order to maintain the durability constraint
% usually required for databases.
% This implementation lacks any interaction with IO systems.
% We expect the formalization of correct interaction 
% to be a particulary tricky task.

% TODO some analysis of the code generated? even necessary?

\bibliography{lipics-v2021-bplustrees}

\end{document}