#!/bin/bash

# a little script whipped up to estimate compression ratio from select extensions, for HMS
# it picks 5 examples of each extension from starfish, runs xz on them, and prints the original size,
# the compressed size, and the compression ratio on stdout
# 2020-03-13 - add glacier and deep archive estimates
# 2020-05-14 - added example of using with snapshot
# 2021-02-11 - noted costs used in help
#***********************************************************************************************************
#
# Starfish Storage Corporation ("Starfish") CONFIDENTIAL
# Unpublished Copyright (c) 2011 - present Starfish Storage Corporation, All Rights Reserved.
#
# NOTICE: This file and its contents (1) constitute Starfish's "External Code" under Starfish's most-recent
# Limited Software End-User License Agreement, and (2) is and remains the property of Starfish. The
# intellectual and technical concepts contained herein are proprietary to Starfish and may be covered by
# U.S. and/or foreign patents or patents in process, and are protected by trade secret or copyright law.
# Dissemination of this information or reproduction of this material is strictly forbidden unless prior
# written permission is obtained from Starfish. Access to the source code contained herein is hereby
# forbidden to anyone except (A) current Starfish employees, managers, or contractors who have executed
# confidentiality or nondisclosure agreements explicitly covering such access, and (B) licensees of
# Starfish's software.
#
# ANY REPRODUCTION, COPYING, MODIFICATION, DISTRIBUTION, PUBLIC PERFORMANCE, OR PUBLIC DISPLAY OF OR
# THROUGH USE OF THIS SOURCE CODE WITHOUT THE EXPRESS WRITTEN CONSENT OF STARFISH IS STRICTLY PROHIBITED
# AND IS IN VIOLATION OF APPLICABLE LAWS AND INTERNATIONAL TREATIES. THE RECEIPT OR POSSESSION OF THIS
# FILE OR ITS CONTENTS AND/OR RELATED INFORMATION DOES NOT CONVEY OR IMPLY ANY RIGHTS TO REPRODUCE,
# DISCLOSE, OR DISTRIBUTE ITS CONTENTS, OR TO MANUFACTURE, USE, OR SELL ANYTHING THAT IT MAY DESCRIBE, IN
# WHOLE OR IN PART.
#
# FOR U.S. GOVERNMENT CUSTOMERS REGARDING THIS DOCUMENTATION/SOFTWARE
#   These notices shall be marked on any reproduction of this data, in whole or in part.
#   NOTICE: Notwithstanding any other lease or license that may pertain to, or accompany the delivery of,
#   this computer software, the rights of the Government regarding its use, reproduction and disclosure are
#   as set forth in Section 52.227-19 of the FARS Computer Software-Restricted Rights clause.
#   RESTRICTED RIGHTS NOTICE: Use, duplication, or disclosure by the Government is subject to the
#   restrictions as set forth in subparagraph (c)(1)(ii) of the Rights in Technical Data and Computer
#   Software clause at DFARS 52.227-7013.
#
#***********************************************************************************************************


# Built-in defaults. The option parser further down replaces them when
# -t (threads), -l (level) or -c (compressor command) is given.
THREADS=4
LEVEL=6
COMPRESSOR='xz -c'

# usage
#   Writes the help text to stdout: what the script does, the S3 prices the
#   cost table is based on, the accepted options and two example invocations.
#   Takes no arguments and does not exit; callers decide the exit status.
usage() {
    # One argument per output line; single quotes keep "$" and "\" literal.
    printf '%s\n' \
        'Given a volume and mountpoint, estimate the compressibility by ' \
        'extension, total compressibility and AWS storage costs per' \
        'S3 class of storage. It takes 5 samples of files per extension,' \
        'runs a compression test on them and averages them.' \
        '' \
        'Price per GB/month used:' \
        'S3 Standard:          $.021' \
        'S3 Infrequent Access: $.0125' \
        'S3 Glacier:           $.004' \
        'S3 Deep Archive:      $.00099' \
        '' \
        'usage compression_estimator.sh -v <volume> -m <mountpoint> [ -p <volume subpath> ] \' \
        '    [ -c <compressor> ] \' \
        '    [ -l <compression_level> ] \' \
        '    [ -t <threads> ]' \
        '' \
        'We recommend running this in screen or tmux, since it may take a while to run' \
        '' \
        'default compression is xz level 6 (good)' \
        'when using xz, default thread count is 4.' \
        "e.g. ./compression_estimator.sh -v home -m /home -c 'gzip -c' -l 5" \
        '' \
        'Example when using snapshot:' \
        './compression_estimator.sh -v home -m /home/.snapshot/daily_2020_05_04/ -c xz -l 6 -t 8'
}

# PID of this run; used as the suffix of the per-run files written under /tmp.
pid=$$

# Command-line options:
#   -v volume (required)      -m mountpoint (required)   -p volume subpath
#   -c compressor command     -l compression level       -t thread count
#   -h prints the help text and exits 0; anything unrecognised prints it and
#   exits 1.
while getopts "v:m:p:c:l:t:h" opt; do
  case $opt in
      v) VOLUME="$OPTARG" ;;
      m) MOUNT="$OPTARG" ;;
      l) LEVEL="$OPTARG" ;;
      c) COMPRESSOR="$OPTARG" ;;
      p) SUBPATH="$OPTARG" ;;
      t) THREADS="$OPTARG" ;;
      h) usage; exit 0 ;;
      ?|*) usage; exit 1 ;;
  esac
done

# Volume and mountpoint are mandatory. Stop here when either is missing;
# carrying on would run the queries below against an empty volume name.
if [ -z "$VOLUME" ] || [ -z "$MOUNT" ]; then
    echo "error: -v <volume> and -m <mountpoint> are both required" >&2
    usage
    exit 1
fi

# "<volume>:<subpath>" is the location handed to "sf query"; MNTPOINT is the
# text substituted for it in the query output to get a local path.
VOLPATH="${VOLUME}:${SUBPATH}"
MNTPOINT="${MOUNT}/"

# Any compressor command containing "xz" is given the thread-count option.
if [[ $COMPRESSOR =~ xz ]]; then
    COMPRESSOR="${COMPRESSOR} -T ${THREADS}"
fi

# measure_samples
#   Reads "<path> <size>" lines on stdin, feeds each file through
#   "$COMPRESSOR -$LEVEL" and prints one line per sample:
#       <lowercased extension> <size> <compressed bytes> <percent saved>
#   Lines that do not have exactly two whitespace-separated fields (for
#   example paths containing blanks) are skipped.
#   Globals: COMPRESSOR, LEVEL (read).
#   The path is single-quoted before it is placed on the shell command line,
#   so characters such as ; & $ ( ) or quotes in a file name cannot be
#   interpreted by the shell. Plain awk is used, as elsewhere in this script;
#   the command is built in a variable first so "cmd | getline" parses the
#   same way in every awk.
measure_samples() {
    awk -v compress_cmd="$COMPRESSOR -$LEVEL" -v sq="'" '
        # Wrap s in single quotes for sh, escaping any embedded single quote.
        function shell_quote(s) {
            gsub(sq, sq "\\" sq sq, s)
            return sq s sq
        }
        NF == 2 {
            pipeline = compress_cmd " < " shell_quote($1) " | wc -c"
            csize = 0                   # never reuse the previous sample
            pipeline | getline csize
            close(pipeline)             # do not accumulate open pipes
            n = split($1, parts, ".")
            saved = ($2 > 0) ? 100-(100.0*csize/$2) : 0
            print tolower(parts[n]), $2, csize, saved
        }'
}

echo "getting extensions"
# Keep only extensions whose usage is reported in GiB or TiB. The list goes to
# the fixed path /tmp/used_by_ext, which the final cost summary reads back.
sf query "$VOLPATH" --group-by ext --type f \
    | grep --extended-regexp 'TiB|GiB' \
    | sort -k4,4n \
    | tee /tmp/used_by_ext

echo "saving partial results to /tmp/comp_avgs.$pid"

# For every listed extension, ask for up to 5 files between 100K and 1G,
# rewrite the volume prefix to the local mountpoint and measure each file.
# shellcheck disable=SC2013
for ext in $(awk '{print $1}' /tmp/used_by_ext); do
    sf query "$VOLPATH" -H --ext "$ext" --size 100K-1G --limit 5 --format +size \
        | sed -e "s?${VOLPATH}?$MNTPOINT?" \
        | grep --extended-regexp -v 'gfs[0-9]{4,}' \
        | measure_samples \
        | tee -a "/tmp/comp_avgs.$pid"
done

echo "net reduction"

# compute_reductions [FILE...]
#   Input lines (from FILE or stdin): <ext> <size> <compressed size> ...
#   Output: one "<ext> <ratio>" line per extension, where ratio is the sum of
#   compressed sizes divided by the sum of original sizes.
#   Samples whose compressed size is 0 are left out entirely ("next"), so
#   they no longer pull the ratio towards zero.
compute_reductions() {
    awk '
        $3 == 0 { next }
        { src[$1] += $2; dst[$1] += $3 }
        END {
            for (type in src) {
                # A zero total would make the division fail; skip it.
                if (src[type] > 0) {
                    print tolower(type), (1.0*dst[type]/src[type])
                }
            }
        }
    ' "$@"
}

compute_reductions "/tmp/comp_avgs.$pid" > "/tmp/reductions.$pid"

# print_cost_summary REDUCTIONS_FILE USAGE_FILE
#   REDUCTIONS_FILE  lines of "<ext> <ratio>" (compressed/original).
#   USAGE_FILE       per-extension usage rows; field 1 is the extension and
#                    field 3 is summed and divided by 10^9 to report GB.
#   Prints the reduced size per extension that has a ratio, the total and
#   reduced GB with the percentage saved, and monthly S3 cost estimates at
#   $.021 (Standard), $.0125 (IA), $.004 (Glacier) and $.00099 (Deep Archive)
#   per GB. An empty usage file yields zeros instead of a division error.
print_cost_summary() {
    awk -v reductfile="$1" '
    BEGIN {
        # Parenthesised so the comparison applies to the getline result.
        while ((getline < reductfile) > 0) {
            reduc[$1] = $2
        }
        print "---Space used per extension---"
    }
    { sumtot += $3 }
    # Extensions without a measured ratio are counted at full size.
    !($1 in reduc) { reductot += $3 }
    $1 in reduc {
        reductot += reduc[$1] * $3
        printf("%-10s %15d\n", $1, reduc[$1] * $3)
    }
    END {
        gbtot = sumtot/(1000*1000*1000)
        gbreduced = reductot/(1000*1000*1000)
        saved_pct = (sumtot > 0) ? 100.0-(100.0*reductot/sumtot) : 0
        s3cost = gbtot * .021
        cs3cost = gbreduced * .021
        iacost = gbtot * .0125
        ciacost = gbreduced * .0125
        printf("%10s %10s Savings\n", "TotalGB", "ReducedTo")
        printf("%10d %10d %5.2f%%\n", gbtot, gbreduced, saved_pct)
        print "S3 estimated costs $/mo"
        printf("%-15s %10s %10s %10s %10s\n", " ", "Standard", "IA", "Glacier", "Deep Arc")
        printf("%-15s %10.2f %10.2f %10.2f %10.2f\n", "Uncompressed", s3cost, iacost, gbtot*.004, gbtot*.00099)
        printf("%-15s %10.2f %10.2f %10.2f %10.2f\n", "Compressed", cs3cost, ciacost, gbreduced*.004, gbreduced*.00099)
    }' "$2"
}

print_cost_summary "/tmp/reductions.$pid" /tmp/used_by_ext

# The estimation steps above do not check any command status, so reaching
# this line always reports success to the caller.
exit 0
