@ -1,12 +1,17 @@
#!/usr/bin/env python3
#!/usr/bin/env python3
import csv
import csv
import math
import os
import os
import random
import random
import sys
import sys
import matplotlib
matplotlib . use ( " Agg " )
import matplotlib . backends . backend_pdf
import matplotlib . backends . backend_pdf
import matplotlib . pyplot as plt
import matplotlib . pyplot as plt
import numpy as np
import numpy as np
import pandas as pd
import seaborn as sns
# Make sure a legend has the same color across all generated graphs.
# Make sure a legend has the same color across all generated graphs.
@ -19,7 +24,7 @@ def get_cmap(n, name="hsv"):
# Shared colour state: a label keeps the colour it was first given so that the
# same legend entry looks identical across all generated graphs.
color_index = 0  # position of the next unassigned entry in `colors`
bar_color_maps = {}  # label -> colour already assigned to that label
# Number of distinct colours sampled from the colour map.
n_colors = 360
linear_colors = get_cmap(n_colors)
colors = [linear_colors(i) for i in range(n_colors)]
@ -35,41 +40,95 @@ def num_to_gb(n):
return " {0:.2f} " . format ( float ( n ) / one_gb )
return " {0:.2f} " . format ( float ( n ) / one_gb )
def _load_miss_stats(csv_result_dir, file_prefix, file_suffix):
    """Collect per-configuration (capacity, value) series from matching files.

    Every file in `csv_result_dir` whose name starts with `file_prefix` and
    ends with `file_suffix` is read as header-less CSV with the columns
    cache_name, num_shard_bits, ghost_capacity, capacity, value.

    Returns a tuple `(stats, last_file)` where `stats` maps
    "<cache_name>-<num_shard_bits>-<ghost_capacity>" to
    `{"x": [capacity, ...], "y": [value, ...]}` and `last_file` is the name of
    the last file that was read (None when nothing matched).
    """
    stats = {}
    last_file = None
    # Sorted so that the point order and the reported file name are stable.
    for file in sorted(os.listdir(csv_result_dir)):
        if not (file.startswith(file_prefix) and file.endswith(file_suffix)):
            continue
        print("Processing file {}/{}".format(csv_result_dir, file))
        last_file = file
        with open(csv_result_dir + "/" + file, "r") as csvfile:
            for row in csv.reader(csvfile, delimiter=","):
                if not row:
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                config = "{}-{}-{}".format(row[0], int(row[1]), int(row[2]))
                series = stats.setdefault(config, {"x": [], "y": []})
                series["x"].append(int(row[3]))
                series["y"].append(float(row[4]))
    return stats, last_file


def plot_miss_ratio_graphs(csv_result_dir, output_result_dir):
    """Plot the miss ratio curves stored in `<csv_result_dir>/mrc`.

    Legacy single-file variant: the first CSV row is skipped as a header and
    capacities are converted with `num_to_gb`. Does nothing when the file is
    missing; otherwise writes `<output_result_dir>/mrc.pdf`.
    """
    mrc_file_path = csv_result_dir + "/mrc"
    if not os.path.exists(mrc_file_path):
        return
    miss_ratios = {}
    print("Processing file {}".format(mrc_file_path))
    with open(mrc_file_path, "r") as csvfile:
        rows = csv.reader(csvfile, delimiter=",")
        next(rows, None)  # skip the header row
        for row in rows:
            config = "{}-{}-{}".format(row[0], int(row[1]), int(row[2]))
            series = miss_ratios.setdefault(config, {"x": [], "y": []})
            series["x"].append(num_to_gb(int(row[3])))
            series["y"].append(float(row[4]))
    fig = plt.figure()
    for config, series in miss_ratios.items():
        plt.plot(series["x"], series["y"], label=config)
    plt.xlabel("Cache capacity (GB)")
    plt.ylabel("Miss Ratio (%)")
    # `bottom` is the supported spelling; the old `ymin` keyword was removed.
    plt.ylim(bottom=0)
    plt.title("RocksDB block cache miss ratios")
    plt.legend()
    fig.savefig(output_result_dir + "/mrc.pdf", bbox_inches="tight")
    # Release the figure; pyplot otherwise keeps every figure alive.
    plt.close(fig)


def plot_miss_stats_graphs(
    csv_result_dir, output_result_dir, file_prefix, file_suffix, ylabel, pdf_file_name
):
    """Plot one line per cache configuration from the matching stats files.

    Args:
        csv_result_dir: directory scanned for input files.
        output_result_dir: directory that receives `<pdf_file_name>.pdf`.
        file_prefix: required start of an input file name.
        file_suffix: required end of an input file name.
        ylabel: label of the y axis.
        pdf_file_name: output file name without the `.pdf` extension.

    Nothing is written when no matching file contributes any data.
    """
    miss_ratios, last_file = _load_miss_stats(
        csv_result_dir, file_prefix, file_suffix
    )
    if not miss_ratios:
        return
    fig = plt.figure()
    for config, series in miss_ratios.items():
        plt.plot(series["x"], series["y"], label=config)
    plt.xlabel("Cache capacity")
    plt.ylabel(ylabel)
    # Matplotlib >= 3.3 spells these `base` / `bottom`; the former
    # `basex` / `ymin` keywords have been removed.
    plt.xscale("log", base=2)
    plt.ylim(bottom=0)
    # Title with the last file that was actually read, not whatever entry the
    # directory listing happened to end on.
    plt.title("{}".format(last_file))
    plt.legend()
    fig.savefig(
        output_result_dir + "/{}.pdf".format(pdf_file_name), bbox_inches="tight"
    )
    plt.close(fig)


def plot_miss_stats_diff_lru_graphs(
    csv_result_dir, output_result_dir, file_prefix, file_suffix, ylabel, pdf_file_name
):
    """Plot each configuration's difference from the "lru-0-0" baseline.

    Takes the same arguments as `plot_miss_stats_graphs`. For every capacity
    of the baseline series the plotted value is `config value - baseline
    value`, or 0 when the configuration has no sample at that capacity.
    Returns without writing anything when there is no "lru-0-0" series.
    """
    miss_ratios, last_file = _load_miss_stats(
        csv_result_dir, file_prefix, file_suffix
    )
    baseline = miss_ratios.get("lru-0-0")
    if baseline is None:
        return
    fig = plt.figure()
    for config, series in miss_ratios.items():
        # Capacity -> value lookup; the first sample wins for a repeated
        # capacity, which matches a front-to-back scan of the series.
        value_at = {}
        for capacity, value in zip(series["x"], series["y"]):
            value_at.setdefault(capacity, value)
        diffs = [
            value_at[capacity] - base_value if capacity in value_at else 0
            for capacity, base_value in zip(baseline["x"], baseline["y"])
        ]
        plt.plot(baseline["x"], diffs, label=config)
    plt.xlabel("Cache capacity")
    plt.ylabel(ylabel)
    plt.xscale("log", base=2)
    plt.title("{}".format(last_file))
    plt.legend()
    fig.savefig(
        output_result_dir + "/{}.pdf".format(pdf_file_name), bbox_inches="tight"
    )
    plt.close(fig)
def sanitize ( label ) :
def sanitize ( label ) :
@ -143,6 +202,7 @@ def read_data_for_plot(csvfile, vertical):
def plot_line_charts (
def plot_line_charts (
csv_result_dir ,
csv_result_dir ,
output_result_dir ,
output_result_dir ,
filename_prefix ,
filename_suffix ,
filename_suffix ,
pdf_name ,
pdf_name ,
xlabel ,
xlabel ,
@ -151,11 +211,14 @@ def plot_line_charts(
vertical ,
vertical ,
legend ,
legend ,
) :
) :
global color_index , bar_color_maps , colors
pdf = matplotlib . backends . backend_pdf . PdfPages ( output_result_dir + " / " + pdf_name )
pdf = matplotlib . backends . backend_pdf . PdfPages ( output_result_dir + " / " + pdf_name )
for file in os . listdir ( csv_result_dir ) :
for file in os . listdir ( csv_result_dir ) :
if not file . endswith ( filename_suffix ) :
if not file . endswith ( filename_suffix ) :
continue
continue
print ( " Processing file {} " . format ( file ) )
if not file . startswith ( filename_prefix ) :
continue
print ( " Processing file {} / {} " . format ( csv_result_dir , file ) )
with open ( csv_result_dir + " / " + file , " r " ) as csvfile :
with open ( csv_result_dir + " / " + file , " r " ) as csvfile :
x , labels , label_stats = read_data_for_plot ( csvfile , vertical )
x , labels , label_stats = read_data_for_plot ( csvfile , vertical )
if len ( x ) == 0 or len ( labels ) == 0 :
if len ( x ) == 0 or len ( labels ) == 0 :
@ -163,10 +226,15 @@ def plot_line_charts(
# plot figure
# plot figure
fig = plt . figure ( )
fig = plt . figure ( )
for label_index in label_stats :
for label_index in label_stats :
# Assign a unique color to this label.
if labels [ label_index ] not in bar_color_maps :
bar_color_maps [ labels [ label_index ] ] = colors [ color_index ]
color_index + = 1
plt . plot (
plt . plot (
[ int ( x [ i ] ) for i in range ( len ( x ) ) ] ,
[ int ( x [ i ] ) for i in range ( len ( x ) - 1 ) ] ,
label_stats [ label_index ] ,
label_stats [ label_index ] [ : - 1 ] ,
label = labels [ label_index ] ,
label = labels [ label_index ] ,
color = bar_color_maps [ labels [ label_index ] ] ,
)
)
# Translate time unit into x labels.
# Translate time unit into x labels.
@ -239,10 +307,29 @@ def plot_stacked_bar_charts(
pdf . close ( )
pdf . close ( )
def plot_access_timeline ( csv_result_dir , output_result_dir ) :
def plot_heatmap(csv_result_dir, output_result_dir, filename_suffix, pdf_name, title):
    """Render one annotated heatmap page per matching CSV file into a PDF.

    Args:
        csv_result_dir: directory scanned for input files.
        output_result_dir: directory that receives `pdf_name`.
        filename_suffix: required end of an input file name.
        pdf_name: file name of the multi-page PDF to write.
        title: text placed in front of the file name in each page title.

    Each input CSV must provide the columns `label`, `corr` and `value`; the
    table is pivoted so that `label` becomes the rows and `corr` the columns.
    """
    pdf_path = "{}/{}".format(output_result_dir, pdf_name)
    # The context manager finalises the PDF even if a page fails to render.
    with matplotlib.backends.backend_pdf.PdfPages(pdf_path) as pdf:
        # Sorted so that the page order is stable across runs.
        for file in sorted(os.listdir(csv_result_dir)):
            if not file.endswith(filename_suffix):
                continue
            csv_file_name = "{}/{}".format(csv_result_dir, file)
            print("Processing file {}/{}".format(csv_result_dir, file))
            corr_table = pd.read_csv(csv_file_name)
            # Keyword form: pandas 2.x no longer accepts these positionally.
            corr_table = corr_table.pivot(
                index="label", columns="corr", values="value"
            )
            fig = plt.figure()
            try:
                sns.heatmap(corr_table, annot=True, linewidths=0.5, fmt=".2")
                plt.title("{} filename: {}".format(title, file))
                pdf.savefig(fig)
            finally:
                # Release the figure; pyplot otherwise keeps it alive.
                plt.close(fig)
def plot_timeline ( csv_result_dir , output_result_dir ) :
plot_line_charts (
plot_line_charts (
csv_result_dir ,
csv_result_dir ,
output_result_dir ,
output_result_dir ,
filename_prefix = " " ,
filename_suffix = " access_timeline " ,
filename_suffix = " access_timeline " ,
pdf_name = " access_time.pdf " ,
pdf_name = " access_time.pdf " ,
xlabel = " Time " ,
xlabel = " Time " ,
@ -253,6 +340,109 @@ def plot_access_timeline(csv_result_dir, output_result_dir):
)
)
def convert_to_0_if_nan(n):
    """Return 0.0 when `n` is NaN, otherwise return `n` unchanged."""
    if math.isnan(n):
        return 0.0
    return n
# (name written to the `corr` column, first input column, second input column)
_CORRELATION_PAIRS = (
    ("LA+A", "num_accesses_since_last_access", "num_accesses_till_next_access"),
    ("PA+A", "num_past_accesses", "num_accesses_till_next_access"),
    ("LT+A", "elapsed_time_since_last_access", "num_accesses_till_next_access"),
    ("LA+T", "num_accesses_since_last_access", "elapsed_time_till_next_access"),
    ("LT+T", "elapsed_time_since_last_access", "elapsed_time_till_next_access"),
    ("PA+T", "num_past_accesses", "elapsed_time_till_next_access"),
)


def plot_correlation(csv_result_dir, output_result_dir):
    """Compute Spearman correlations and plot them as heatmaps.

    Every `*correlation_input` file in `csv_result_dir` contributes six
    `label,corr,value` rows (one per entry of `_CORRELATION_PAIRS`; NaN
    results are stored as 0.0) to `<csv_result_dir>/<first name
    component>_correlation_output`, where the first name component is the
    part of the file name before its first underscore. The output files are
    then rendered into `<output_result_dir>/correlation.pdf`.
    """
    input_suffix = "_correlation_input"
    # Output path -> CSV lines. Everything is computed before any file is
    # opened, so a failing input never leaves a handle open.
    rows_by_output = {}
    for file in os.listdir(csv_result_dir):
        if not file.endswith("correlation_input"):
            continue
        csv_file_name = "{}/{}".format(csv_result_dir, file)
        print("Processing file {}/{}".format(csv_result_dir, file))
        corr_table = pd.read_csv(csv_file_name)
        # "<label_str>_<label>_correlation_input" -> label_str, label
        label_str = file.split("_")[0]
        label = file[len(label_str) + 1 :]
        label = label[: len(label) - len(input_suffix)]
        output_file = "{}/{}_correlation_output".format(csv_result_dir, label_str)
        rows = rows_by_output.setdefault(output_file, ["label,corr,value\n"])
        for corr_name, first_column, second_column in _CORRELATION_PAIRS:
            corr = corr_table[first_column].corr(
                corr_table[second_column], method="spearman"
            )
            rows.append(
                "{},{},{}\n".format(label, corr_name, convert_to_0_if_nan(corr))
            )
    for output_file, rows in rows_by_output.items():
        with open(output_file, "w") as f:
            f.writelines(rows)

    plot_heatmap(
        csv_result_dir,
        output_result_dir,
        "correlation_output",
        "correlation.pdf",
        "Correlation",
    )
def plot_reuse_graphs ( csv_result_dir , output_result_dir ) :
def plot_reuse_graphs ( csv_result_dir , output_result_dir ) :
plot_stacked_bar_charts (
plot_stacked_bar_charts (
csv_result_dir ,
csv_result_dir ,
@ -301,6 +491,7 @@ def plot_reuse_graphs(csv_result_dir, output_result_dir):
plot_line_charts (
plot_line_charts (
csv_result_dir ,
csv_result_dir ,
output_result_dir ,
output_result_dir ,
filename_prefix = " " ,
filename_suffix = " reuse_blocks_timeline " ,
filename_suffix = " reuse_blocks_timeline " ,
pdf_name = " reuse_blocks_timeline.pdf " ,
pdf_name = " reuse_blocks_timeline.pdf " ,
xlabel = " " ,
xlabel = " " ,
@ -370,14 +561,90 @@ def plot_access_count_summary(csv_result_dir, output_result_dir):
vertical = True ,
vertical = True ,
x_prefix = " < " ,
x_prefix = " < " ,
)
)
plot_line_charts (
csv_result_dir ,
output_result_dir ,
filename_prefix = " " ,
filename_suffix = " skewness " ,
pdf_name = " skew.pdf " ,
xlabel = " " ,
ylabel = " Percentage of accesses " ,
title = " Skewness " ,
vertical = True ,
legend = False ,
)
def plot_miss_ratio_timeline(csv_result_dir, output_result_dir):
    """Plot the hourly (3600 s) miss and policy-selection timelines.

    One PDF is written to `output_result_dir` per entry in the table below,
    each built by `plot_line_charts` from the files in `csv_result_dir` whose
    names end with the given suffix.
    """
    # (filename_suffix, pdf_name, ylabel, title)
    charts = (
        (
            "3600_miss_ratio_timeline",
            "miss_ratio_timeline.pdf",
            "Miss Ratio (%)",
            "Miss ratio timeline",
        ),
        # This chart used to be generated twice with identical arguments; the
        # second pass only rewrote the same PDF.
        (
            "3600_miss_timeline",
            "miss_timeline.pdf",
            "# of misses",
            "Miss timeline",
        ),
        (
            "3600_policy_timeline",
            "policy_timeline.pdf",
            "# of times a policy is selected",
            "Policy timeline",
        ),
        (
            "3600_policy_ratio_timeline",
            "policy_ratio_timeline.pdf",
            "Percentage of times a policy is selected",
            "Policy timeline",
        ),
    )
    for filename_suffix, pdf_name, ylabel, title in charts:
        plot_line_charts(
            csv_result_dir,
            output_result_dir,
            filename_prefix="",
            filename_suffix=filename_suffix,
            pdf_name=pdf_name,
            xlabel="Time",
            ylabel=ylabel,
            title=title,
            vertical=False,
            legend=True,
        )
if __name__ == " __main__ " :
if __name__ == " __main__ " :
if len ( sys . argv ) < 3 :
if len ( sys . argv ) < 3 :
print (
print (
" Must provide two arguments: 1) The directory that saves a list of "
" Must provide two arguments: \n "
" directories which contain block cache trace analyzer result files "
" 1) The directory that saves a list of "
" 2) the directory to save plotted graphs. "
" directories which contain block cache trace analyzer result files. \n "
" 2) the directory to save plotted graphs. \n "
)
)
exit ( 1 )
exit ( 1 )
csv_result_dir = sys . argv [ 1 ]
csv_result_dir = sys . argv [ 1 ]
@ -396,8 +663,59 @@ if __name__ == "__main__":
print ( " Processing experiment dir: {} " . format ( csv_relative_dir ) )
print ( " Processing experiment dir: {} " . format ( csv_relative_dir ) )
if not os . path . exists ( result_dir ) :
if not os . path . exists ( result_dir ) :
os . makedirs ( result_dir )
os . makedirs ( result_dir )
plot_miss_ratio_graphs ( csv_abs_dir , result_dir )
plot_access_count_summary ( csv_abs_dir , result_dir )
plot_access_timeline ( csv_abs_dir , result_dir )
plot_timeline ( csv_abs_dir , result_dir )
plot_miss_ratio_timeline ( csv_result_dir , output_result_dir )
plot_correlation ( csv_abs_dir , result_dir )
plot_reuse_graphs ( csv_abs_dir , result_dir )
plot_reuse_graphs ( csv_abs_dir , result_dir )
plot_percentage_access_summary ( csv_abs_dir , result_dir )
plot_percentage_access_summary ( csv_abs_dir , result_dir )
plot_access_count_summary ( csv_abs_dir , result_dir )
plot_miss_stats_graphs (
csv_abs_dir ,
result_dir ,
file_prefix = " " ,
file_suffix = " mrc " ,
ylabel = " Miss ratio ( % ) " ,
pdf_file_name = " mrc " ,
)
plot_miss_stats_diff_lru_graphs (
csv_abs_dir ,
result_dir ,
file_prefix = " " ,
file_suffix = " mrc " ,
ylabel = " Miss ratio ( % ) " ,
pdf_file_name = " mrc_diff_lru " ,
)
# The following stats are only available in pysim.
for time_unit in [ " 1 " , " 60 " , " 3600 " ] :
plot_miss_stats_graphs (
csv_abs_dir ,
result_dir ,
file_prefix = " ml_ {} _ " . format ( time_unit ) ,
file_suffix = " p95mb " ,
ylabel = " p95 number of byte miss per {} seconds " . format ( time_unit ) ,
pdf_file_name = " p95mb_per {} _seconds " . format ( time_unit ) ,
)
plot_miss_stats_graphs (
csv_abs_dir ,
result_dir ,
file_prefix = " ml_ {} _ " . format ( time_unit ) ,
file_suffix = " avgmb " ,
ylabel = " Average number of byte miss per {} seconds " . format ( time_unit ) ,
pdf_file_name = " avgmb_per {} _seconds " . format ( time_unit ) ,
)
plot_miss_stats_diff_lru_graphs (
csv_abs_dir ,
result_dir ,
file_prefix = " ml_ {} _ " . format ( time_unit ) ,
file_suffix = " p95mb " ,
ylabel = " p95 number of byte miss per {} seconds " . format ( time_unit ) ,
pdf_file_name = " p95mb_per {} _seconds_diff_lru " . format ( time_unit ) ,
)
plot_miss_stats_diff_lru_graphs (
csv_abs_dir ,
result_dir ,
file_prefix = " ml_ {} _ " . format ( time_unit ) ,
file_suffix = " avgmb " ,
ylabel = " Average number of byte miss per {} seconds " . format ( time_unit ) ,
pdf_file_name = " avgmb_per {} _seconds_diff_lru " . format ( time_unit ) ,
)