- \documentclass{beamer}
- \usepackage[utf8]{inputenc}
- \usepackage{graphicx}
- \usepackage{dirtytalk}
- \usepackage{epstopdf}
- \usepackage{hyperref}
- \graphicspath{ {images/} }
- \usetheme{CambridgeUS}
- \usecolortheme{beaver}
- \AtBeginSection[]
- {
- \begin{frame}
- \frametitle{Table of Contents}
- \tableofcontents[currentsection]
- \end{frame}
- }
- \title{Databases 2 - Optional Presentation}
- \author{Andrea Gussoni}
- \institute{Politecnico di Milano}
- \date{July 15, 2016}
- \begin{document}
- \frame{\titlepage}
- \section{Coordination Avoidance}
- \begin{frame}
- \frametitle{Some information on the paper}
- \begin{itemize}
- \item \textbf{Title:} Coordination Avoidance in Database Systems.
- \item \textbf{Authors:} Peter Bailis, Alan Fekete, Michael J. Franklin, Ali Ghodsi, Joseph M. Hellerstein, Ion Stoica.
- \item Presented at \textbf{2015 VLDB}.
- \end{itemize}
- \end{frame}
- \begin{frame}
- \frametitle{The Problem}
- Nowadays, database systems deployed in distributed scenarios are increasingly common. As a consequence, the task of coordinating the different entities involved is becoming increasingly important.
- \end{frame}
- \begin{frame}
- \frametitle{The Problem}
- Usually, \textbf{concurrency control} protocols are necessary because we want to guarantee the consistency of application-level data through a database layer that detects and resolves possible conflicts. A classic example is two-phase locking (2PL), a serialization technique often used in commercial DBMSs.
- \end{frame}
- \begin{frame}
- \frametitle{The Problem}
- Combining this with a distributed scenario means introducing \textbf{complex algorithms} (such as 2PC) that coordinate the various entities involved in a transaction, \textbf{introducing latency}. Coordination also means that we cannot fully exploit the parallel resources of a distributed environment, because the coordination phase introduces a huge overhead. A minimal 2PC sketch follows on the next slide.
- \end{frame}
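- \begin{frame}[fragile]
- \frametitle{The Problem: A 2PC Sketch}
- A minimal sketch in Scala (the language used for the paper's prototype) of the coordination cost just described; the names are illustrative, not taken from the paper. One prepare round trip plus one decision round trip must complete before any transaction finishes:
- \begin{verbatim}
- // Hypothetical sketch of a 2PC round, not the paper's code.
- trait Participant {
-   def prepare(txId: Long): Boolean // phase 1 vote: yes/no
-   def commit(txId: Long): Unit
-   def abort(txId: Long): Unit
- }
- 
- object Coordinator {
-   // Phase 1 collects every vote; phase 2 broadcasts the
-   // decision. Latency is paid on both round trips.
-   def run(txId: Long, ps: Seq[Participant]): Boolean = {
-     val allYes = ps.forall(_.prepare(txId))
-     if (allYes) ps.foreach(_.commit(txId))
-     else ps.foreach(_.abort(txId))
-     allYes
-   }
- }
- \end{verbatim}
- \end{frame}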
- \begin{frame}
- \frametitle{The Problem}
- We usually pay the coordination overhead in terms of:
- \begin{itemize}
- \item Increased latency.
- \item Decreased throughput.
- \item Unavailability (in case of failures).
- \end{itemize}
- \end{frame}
- \begin{frame}
- \frametitle{The Problem}
- \begin{figure}
- \caption{Microbenchmark performance of coordinated and coordination-free execution on eight separate multi-core servers.}
- \centering
- \includegraphics[width=0.85\textwidth,height=0.60\textheight]{2pl-free}
- \end{figure}
- \end{frame}
- \begin{frame}
- \frametitle{Invariant Confluence}
- The authors of the paper discuss a new technique (more precisely, an \textbf{analysis framework}) that, when applied, considerably reduces the need for coordination between database entities, cutting costs in terms of bandwidth and latency and considerably increasing the overall throughput of the system.
- \end{frame}
- \begin{frame}
- \frametitle{Invariant Confluence}
- The main idea is not to introduce some new exotic way to improve the coordination task itself; instead, the authors build on the fact that there is a set of workloads that do \textbf{not require coordination} and can be executed in parallel. The programmer at the application level then states explicitly the \emph{invariants}: declared properties of the data that require coordination only when concurrent operations could violate them (a small sketch follows on the next slide).
- \end{frame}
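- \begin{frame}[fragile]
- \frametitle{Invariant Confluence: A Sketch}
- A minimal sketch of the idea, with invented names (the paper does not prescribe this syntax): the application declares an invariant, and a replica can check locally whether an operation preserves it.
- \begin{verbatim}
- // Hypothetical sketch; names are illustrative only.
- // A declared invariant over a stock counter: increments are
- // always safe, a decrement is safe only if the local state
- // keeps the invariant true.
- case class Invariant(name: String, holds: Long => Boolean)
- 
- object StockExample {
-   val nonNegative = Invariant("stock >= 0", _ >= 0L)
- 
-   def locallySafe(current: Long, delta: Long): Boolean =
-     nonNegative.holds(current + delta)
- }
- \end{verbatim}
- \end{frame}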
- \begin{frame}
- \frametitle{The Model}
- The main concepts introduced (merging is sketched on the next slide):
- \begin{itemize}
- \item Invariants \pause
- \item Transactions \pause
- \item Replicas \pause
- \item (\emph{I-})Convergence \pause
- \item Merging
- \end{itemize}
- \end{frame}
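- \begin{frame}[fragile]
- \frametitle{Merging: A Sketch}
- A minimal sketch of the merge step under one simple assumption, a set-union merge function (one of many possible choices; the names are invented):
- \begin{verbatim}
- // Hypothetical sketch. Union is associative, commutative and
- // idempotent, so replicas converge to the same state no
- // matter in which order the merges happen.
- case class Replica(state: Set[String]) {
-   def write(item: String): Replica = Replica(state + item)
-   def merge(other: Replica): Replica =
-     Replica(state union other.state)
- }
- 
- object ConvergenceDemo extends App {
-   val r1 = Replica(Set.empty).write("order-1")
-   val r2 = Replica(Set.empty).write("order-2")
-   assert(r1.merge(r2) == r2.merge(r1)) // same merged state
- }
- \end{verbatim}
- \end{frame}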
- \begin{frame}
- \frametitle{Convergence}
- This figure illustrates the main concept behind the idea of convergence:
- \includegraphics[width=\textwidth]{convergence}
- \end{frame}
- \begin{frame}
- \frametitle{Coordination-Free Execution}
- Here, instead, we show the basic evolution of a simple coordination-free execution and the subsequent merge operation:
- \includegraphics[width=\textwidth]{coordination-free}
- \end{frame}
- \begin{frame}
- \frametitle{Invariants}
- \begin{itemize}
- \item It is important to note that \textbf{coordination can only be avoided if all local commit decisions are globally valid.}\pause
- \item So the best approach to guaranteeing application-level consistency is to apply a convergence analysis and then identify the \textbf{true conflicts}. Uncertain situations must be treated conservatively. \pause
- \item This means that we rely on the \textbf{analysis} done by the programmer at the application level to guarantee correctness. This is clearly a drawback.
- \end{itemize}
- \end{frame}
- \begin{frame}
- \frametitle{Invariants}
- Luckily, there are some standard situations in the analysis of invariants that we can use as \textbf{boilerplate} when building the set of invariants for our application. This figure summarizes the main cases:
- \centering
- \includegraphics[width=0.85\textwidth,height=0.7\textheight]{invariants}
- \end{frame}
- \begin{frame}
- \frametitle{Benchmarking}
- \begin{itemize}
- \item The authors then implemented this new framework and tested it with a standard benchmark, \textbf{TPC-C}, which is said to be \say{the gold standard for database concurrency control both in research and industry.}
- \item They also used \textbf{RAMP} transactions, which \say{employ limited multi-versioning and metadata to ensure that readers and writers can always proceed concurrently.}
- \item The language selected for the prototype is \textbf{Scala}, chosen for the compactness of the resulting code.
- \end{itemize}
- \end{frame}
- \begin{frame}
- \frametitle{Benchmarking}
- The next few slides show some plots of the results obtained by the authors. The \textbf{New-Order} label refers to the fact that, whenever a unique ID had to be assigned, the authors assigned a \emph{temp-ID}; only just before commit is a sequential \emph{real-ID} assigned (as required by the specification of the benchmark), and a table mapping each \emph{temp-ID} to its \emph{real-ID} is maintained. A sketch of this trick follows on the next slide.
- \end{frame}
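- \begin{frame}[fragile]
- \frametitle{Temp-ID Assignment: A Sketch}
- A minimal sketch of the trick just described; the helper names are hypothetical, not taken from the paper's prototype:
- \begin{verbatim}
- // Hypothetical sketch of the New-Order ID trick.
- import java.util.UUID
- import java.util.concurrent.atomic.AtomicLong
- import scala.collection.concurrent.TrieMap
- 
- object IdAssigner {
-   private val next    = new AtomicLong(0L)
-   private val mapping = TrieMap.empty[String, Long]
- 
-   // Cheap, coordination-free ID used while the transaction
-   // is running.
-   def tempId(): String = UUID.randomUUID().toString
- 
-   // Just before commit: the only sequential step; the pair
-   // temp-ID -> real-ID is recorded in a mapping table.
-   def assignRealId(temp: String): Long = {
-     val real = next.incrementAndGet()
-     mapping.put(temp, real)
-     real
-   }
- }
- \end{verbatim}
- \end{frame}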
- \begin{frame}
- \frametitle{Results}
- \begin{figure}
- \caption{TPC-C New-Order throughput across eight servers.}
- \centering
- \includegraphics[width=0.55\textwidth,height=0.73\textheight]{results1-1}
- \end{figure}
- \end{frame}
- \begin{frame}
- \frametitle{Results}
- \begin{figure}
- \caption{Coordination-avoiding New-Order scalability.}
- \centering
- \includegraphics[width=0.70\textwidth,height=0.70\textheight]{results1-2}
- \end{figure}
- \end{frame}
- \begin{frame}
- \frametitle{Conclusions}
- ACID transactions and the associated strong isolation levels have long dominated the field of database concurrency control. They are a \textbf{powerful abstraction} that automatically guarantees consistency at the application level. In a distributed scenario where we want to achieve \textbf{high scalability}, however, we can relax these abstractions and perform an \textbf{I-Confluence} analysis in order to exploit scalability through \textbf{coordination-free} transactions.
- \end{frame}
- \section{Trekking Through Siberia}
- \begin{frame}
- \frametitle{Some information on the paper}
- \begin{itemize}
- \item \textbf{Title:} Trekking Through Siberia: Managing Cold Data in a Memory-Optimized Database.
- \item \textbf{Authors:} Ahmed Eldawy, Justin Levandoski, Per-Åke Larson.
- \item Presented at \textbf{2014 VLDB}.
- \end{itemize}
- \end{frame}
- \begin{frame}
- \frametitle{Introduction}
- With the drop in memory prices, a set of \textbf{main-memory} databases has emerged. While this solution is reasonable for most OLTP workloads, databases often exhibit skewed access patterns that divide records into \textbf{hot} (frequently accessed) and \textbf{cold} (rarely accessed). It is therefore convenient to keep the hot records in memory and the cold ones on, for example, flash storage, which is still far less expensive than memory.
- \end{frame}
- \begin{frame}
- \frametitle{Introduction}
- This paper presents \textbf{Project Siberia}, an extension to the \textbf{Hekaton} engine of Microsoft SQL Server that pursues these objectives:
- \begin{itemize}
- \item Cold data classification.
- \item Cold data storage.
- \item Cold storage access reduction.
- \item \textbf{Cold data access and migration mechanism} (the focus of this paper is on this aspect).
- \end{itemize}
- \end{frame}
- \begin{frame}
- \frametitle{Hekaton}
- This figure shows how storage and indexing are done in Hekaton:
- \centering
- \includegraphics[width=0.70\textwidth,height=0.70\textheight]{hekaton-storage}
- \end{frame}
- \begin{frame}
- \frametitle{Hekaton}
- Hekaton uses optimistic multi-version concurrency control (MVCC). It mainly leverages the following timestamps (a visibility sketch follows on the next slide):
- \begin{itemize}
- \item Commit/End Time (useful to determine the serialization order).
- \item Valid Time.
- \item Logical Read Time (start time of the transaction).
- \end{itemize}
- \end{frame}
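- \begin{frame}[fragile]
- \frametitle{MVCC Visibility: A Sketch}
- A minimal sketch of how the timestamps above combine, assuming a valid time interval of the form [begin, end); the types are invented, not Hekaton's:
- \begin{verbatim}
- // Hypothetical sketch; not Hekaton's actual structures.
- case class Version(begin: Long, end: Long, payload: String)
- 
- object Mvcc {
-   // Visible iff the logical read time falls in [begin, end).
-   def visible(v: Version, logicalReadTime: Long): Boolean =
-     v.begin <= logicalReadTime && logicalReadTime < v.end
- 
-   // An update closes the old version at commit time and opens
-   // a new one: commit/end times give the serialization order.
-   def update(old: Version, commitTime: Long, p: String) =
-     (old.copy(end = commitTime),
-      Version(commitTime, Long.MaxValue, p))
- }
- \end{verbatim}
- \end{frame}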
- \begin{frame}
- \frametitle{Some important data structures}
- \begin{figure}
- \caption{Structure of a record in the cold store.}
- \centering
- \includegraphics[width=0.30\textwidth,height=0.10\textheight]{cold-record}
- \end{figure}
- \begin{figure}
- \caption{Structure of a cached record.}
- \centering
- \includegraphics[width=0.75\textwidth,height=0.15\textheight]{cached-record}
- \end{figure}
- \begin{figure}
- \caption{Structure of a timestamp notice in the update memo.}
- \centering
- \includegraphics[width=0.70\textwidth,height=0.15\textheight]{timestamp-notice}
- \end{figure}
- \end{frame}
- \begin{frame}
- \frametitle{The main operations on the cold storage}
- \begin{itemize}
- \item \textbf{Insert:} always performed in the hot store.
- \item \textbf{Migration to cold storage:} the only way to move a record from the hot store to the cold one.
- \item \textbf{Delete.}
- \item \textbf{Updates:} a delete operation on the cold store followed by an insertion into the hot store (sketched on the next slide).
- \item \textbf{Read.}
- \item \textbf{Update Memo and Cold Store Cleaning.}
- \end{itemize}
- \end{frame}
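- \begin{frame}[fragile]
- \frametitle{Updating a Cold Record: A Sketch}
- A minimal sketch of the update rule from the previous slide, with both stores reduced to plain maps (an assumption for illustration; Siberia's real stores are far richer):
- \begin{verbatim}
- // Hypothetical sketch; not the actual Siberia code.
- import scala.collection.mutable
- 
- object Stores {
-   val hot  = mutable.Map.empty[Long, String]
-   val cold = mutable.Map.empty[Long, String]
- 
-   // Updates never write to the cold store: the cold version
-   // (if any) is deleted and the new version is inserted hot.
-   def update(key: Long, value: String): Unit = {
-     cold.remove(key)
-     hot.put(key, value)
-   }
- }
- \end{verbatim}
- \end{frame}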
- \begin{frame}
- \frametitle{Focus on migration}
- \begin{figure}
- \caption{Contents of cold store, hot store, and update memo during migration of a record.}
- \centering
- \includegraphics[width=0.60\textwidth,height=0.60\textheight]{cold-migration}
- \end{figure}
- \end{frame}
- \begin{frame}
- \frametitle{Focus on delete}
- \begin{figure}
- \caption{Effect on the cold store and update memo of a record deletion.}
- \centering
- \includegraphics[width=0.60\textwidth,height=0.70\textheight]{cold-delete}
- \end{figure}
- \end{frame}
- \begin{frame}
- \frametitle{Observations}
- \begin{itemize}
- \item We need a new phase called \textbf{validation}, which checks, just before commit, that all the records used during the transaction still exist, are valid, and have not been modified by another concurrent transaction (a minimal sketch follows on the next slide).
- \item There is \textbf{no deletion} in the strict sense of the term. Records to be deleted have their end timestamps updated, and garbage collection removes the unused records once every live transaction began after the deletion.
- \end{itemize}
- \end{frame}
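- \begin{frame}[fragile]
- \frametitle{Commit-Time Validation: A Sketch}
- A minimal sketch of the validation phase, assuming the read set tracks the version each read observed (the names are invented):
- \begin{verbatim}
- // Hypothetical sketch; names are illustrative only.
- case class ReadEntry(key: Long, versionSeen: Long)
- 
- object Validation {
-   // Commit only if every record read still exists and still
-   // carries the version the transaction originally saw.
-   def validate(reads: Seq[ReadEntry],
-                current: Long => Option[Long]): Boolean =
-     reads.forall(r => current(r.key).contains(r.versionSeen))
- }
- \end{verbatim}
- \end{frame}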
- \begin{frame}
- \frametitle{Benchmarks}
- The authors utilized two types of benchmarks:
- \begin{itemize}
- \item \textbf{YCSB Benchmark} (50GB single-table database, 1KB records), divided into:
- \begin{itemize}
- \item Read-heavy: 90\% reads and 10\% updates.
- \item Write-heavy: 50\% reads and 50\% writes.
- \item Read-only: 100\% reads.
- \end{itemize}
- \item \textbf{Multi-step read/update workload} (1GB single-table database, 56-byte records), divided into:
- \begin{itemize}
- \item Read-only.
- \item Update-only.
- \end{itemize}
- \end{itemize}
- \end{frame}
- \begin{frame}
- \frametitle{In-Memory Cold storage}
- This analysis is done in order to isolate the overhead strictly caused by the Siberia framework, eliminating the latency of the I/O operations:
- \begin{figure}
- \caption{In-memory overhead of the Siberia framework.}
- \centering
- \includegraphics[width=0.7\textwidth,height=0.60\textheight]{in-memory-overhead}
- \end{figure}
- \end{frame}
- \begin{frame}
- \frametitle{Migration}
- This analysis instead focuses on the performance degradation of various workload types during a \textbf{live migration} of parts of the database to the cold store:
- \begin{figure}
- \caption{Performance during live migration to the cold store.}
- \centering
- \includegraphics[width=0.65\textwidth,height=0.55\textheight]{migration}
- \end{figure}
- \end{frame}
- \begin{frame}
- \frametitle{Read-only workload with I/O}
- This analysis focuses on the performance degradation of a read-only workload with cold storage on flash (a similar analysis has been done for an update-only workload):
- \begin{figure}
- \centering
- \includegraphics[width=0.8\textwidth,height=0.7\textheight]{io-read-only-a}
- \end{figure}
- \end{frame}
- \begin{frame}
- \frametitle{Read-only workload with I/O}
- \begin{figure}
- \centering
- \includegraphics[width=0.9\textwidth,height=0.8\textheight]{io-read-only-b}
- \end{figure}
- \end{frame}
- \begin{frame}
- \frametitle{YCSB benchmark}
- \begin{figure}
- \caption{YCSB write-heavy workload.}
- \centering
- \includegraphics[width=0.8\textwidth,height=0.7\textheight]{ycsb-write-heavy}
- \end{figure}
- \end{frame}
- \begin{frame}
- \frametitle{Conclusions}
- There is related research in progress in this direction:
- \begin{itemize}
- \item \textbf{Buffer pool:} page indirection on disk.
- \item \textbf{HyPer:} a hybrid OLTP/OLAP system; it organizes data in chunks using different virtual memory pages, finds the cold data, and compresses it for OLAP usage.
- \end{itemize}\pause
- \begin{block}{}
- The approach used in Siberia has the great advantage of operating at \textbf{record level}; moreover, for databases where the cold store holds between 10\% and 20\% of the whole database, it does \textbf{not require additional in-memory structures} for the cold data, except the compact Bloom filters (a small sketch follows on the next slide).
- \end{block}
- \end{frame}
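- \begin{frame}[fragile]
- \frametitle{Bloom Filters: A Sketch}
- A minimal sketch of why the Bloom filters stay compact, using a toy boolean array and an invented hash scheme (Siberia's actual filters differ):
- \begin{verbatim}
- // Hypothetical sketch; a "no" answer is definitive, so most
- // lookups for hot keys never touch the cold store at all.
- class BloomFilter(bits: Int, hashes: Int) {
-   private val array = new Array[Boolean](bits)
-   private def slots(key: Long): Seq[Int] =
-     (1 to hashes).map { i =>
-       (((key * 0x9E3779B97F4A7C15L + i)
-          & Long.MaxValue) % bits).toInt
-     }
-   def add(key: Long): Unit =
-     slots(key).foreach(i => array(i) = true)
-   def mightContain(key: Long): Boolean =
-     slots(key).forall(i => array(i))
- }
- \end{verbatim}
- \end{frame}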
- \begin{frame}
- \frametitle{License}
- \centering
- \includegraphics[width=0.3\textwidth]{license}\\
- This work is licensed under a Creative Commons Attribution 4.0 International License. To view a copy of
- this license, visit \url{http://creativecommons.org/licenses/by/4.0/}
- or send a letter to Creative Commons, 444 Castro Street, Suite
- 900, Mountain View, California, 94041, USA.
- \end{frame}
- \end{document}