#!/bin/bash

# simplified multi-stage, optimized copy
#
# *******************************************************************************************************
#
# Starfish Storage Corporation ("Starfish") CONFIDENTIAL
# Unpublished Copyright (c) 2011 - present Starfish Storage Corporation, All Rights Reserved.
#
# NOTICE: This file and its contents (1) constitute Starfish's "External Code" under Starfish's most-recent
# Limited Software End-User License Agreement, and (2) is and remains the property of Starfish. The
# intellectual and technical concepts contained herein are proprietary to Starfish and may be covered by
# U.S. and/or foreign patents or patents in process, and are protected by trade secret or copyright law.
# Dissemination of this information or reproduction of this material is strictly forbidden unless prior
# written permission is obtained from Starfish. Access to the source code contained herein is hereby
# forbidden to anyone except (A) current Starfish employees, managers, or contractors who have executed
# confidentiality or nondisclosure agreements explicitly covering such access, and (B) licensees of
# Starfish's software.
#
# ANY REPRODUCTION, COPYING, MODIFICATION, DISTRIBUTION, PUBLIC PERFORMANCE, OR PUBLIC DISPLAY OF OR
# THROUGH USE OF THIS SOURCE CODE WITHOUT THE EXPRESS WRITTEN CONSENT OF STARFISH IS STRICTLY PROHIBITED
# AND IS IN VIOLATION OF APPLICABLE LAWS AND INTERNATIONAL TREATIES. THE RECEIPT OR POSSESSION OF THIS
# FILE OR ITS CONTENTS AND/OR RELATED INFORMATION DOES NOT CONVEY OR IMPLY ANY RIGHTS TO REPRODUCE,
# DISCLOSE, OR DISTRIBUTE ITS CONTENTS, OR TO MANUFACTURE, USE, OR SELL ANYTHING THAT IT MAY DESCRIBE, IN
# WHOLE OR IN PART.
#
# FOR U.S. GOVERNMENT CUSTOMERS REGARDING THIS DOCUMENTATION/SOFTWARE
#   These notices shall be marked on any reproduction of this data, in whole or in part.
#   NOTICE: Notwithstanding any other lease or license that may pertain to, or accompany the delivery of,
#   this computer software, the rights of the Government regarding its use, reproduction and disclosure are
#   as set forth in Section 52.227-19 of the FARS Computer Software-Restricted Rights clause.
#   RESTRICTED RIGHTS NOTICE: Use, duplication, or disclosure by the Government is subject to the
#   restrictions as set forth in subparagraph (c)(1)(ii) of the Rights in Technical Data and Computer
#   Software clause at DFARS 52.227-7013.
#
# *********************************************************************************************************
#

# Change Log
# 1.8   2022-08-22  add --check-security-info to copy command for windows and always copy dirs
# 1.7.2 2022-02-23  move prescan before hardlink check
# 1.7.1 2022-01-04  rename rsync_wrapper to copy
# 1.7   2021-12-13  add bug fixes for hard linking and fast-fail adjustments
# 1.6.3 2021-08-09  (DG) expanded help
# 1.6.2 2021-08-09  fix issue with hard links and destination directory not existing;
#                   this can happen if a directory consists of only hard links
# 1.6.1 2021-05-11  fix --no-prescan accidental squash in symlinks
# 1.6 2021-04-29    fix --no-prescan accidental squash
# 1.5 2021-04-26    enable copy hardlink of symlinks; make prescan singular
# 1.4 2021-03-31    make hardlink process much more efficient (when necessary)
# 1.3 2021-03-31    fix hardlink re-ification, adding check for dirs without files only for dir copy step
# 1.2 2020-12-24    add hardlink support, skip pre-scan, copy empty dirs
# 1.1 2020-12-10    add -h and -p
# Strict mode: exit on an unhandled command failure (-e), treat expansion of an
# unset variable as an error (-u), and make a pipeline fail when any of its
# stages fails (pipefail).
set -euo pipefail

# Uncomment the next line to trace every command as it executes.
#set -x

# Print the command-line help on stdout and terminate the script.
# Takes no arguments; always exits with status 1, so it never returns.
usage () {
    # One printf call, one argument per output line; "" yields a blank line.
    printf '%s\n' \
        "usage: $0 [-hpbqdw] [-j <jobname>] SRCVOL:[PATH] DSTVOL:[PATH]" \
        "       -b      Skip files with 0 blocks (WOS/Cloud pool)" \
        "       -h      don't run hardlink checks; skip ahead to copy" \
        "       -p      do a prescan on source volume before copy" \
        "       -q      suppress sub-job output stats for sfcopy" \
        "       -d      deprecated option; no longer used. (compatability)" \
        "       -j <jn> override the default job name with <jn>" \
        "       -w      force windows/SMB-specific copy argument --check-security-info (mostly automatic)" \
        "" \
        "Starfish utility to optimize copying of data disk to disk." \
        "This uses five phases:" \
        " 1. Copy large files, no hardlinks, 1m-1p" \
        " 2. Copy small files, no hardlinks, 0b-1m" \
        " 3. Copy symlinks, all sizes" \
        " 4. Copy empty directories" \
        " 5. Create hardlinks on target" \
        "" \
        "If using multiple instances of copyit.sh on the same data, it is" \
        "recommended to use -j to keep a consistent job name for Starfish" \
        "state-keeping purposes." \
        "" \
        "Examples:" \
        "Standard run options:" \
        "    ./copyit.sh -d SFVOL1: SFVOL2:" \
        "" \
        "Copying subidirectories, skip empty directories:" \
        "    ./copyit.sh SFVOL1:mydir SFVOL2:path/to/targetdir"
    exit 1
}

# Extra flag text placed after "copy" in each job's command string. It starts
# empty and is set to --check-security-info by -w or when the source volume's
# type is reported as Windows.
WINDOWS_ARGS=''

# Value passed as --workers-per-agent for each phase. Every default can be
# overridden through the matching COPYIT_* environment variable; an unset or
# empty override falls back to the number shown.
LARGE_FILES_WORKERS="${COPYIT_LARGE_FILES_WORKERS:-8}"
HARDLINK_WORKERS="${COPYIT_HARDLINK_WORKERS:-12}"
DIRS_WORKERS="${COPYIT_DIRS_WORKERS:-64}"
SMALL_FILES_WORKERS="${COPYIT_SMALL_FILES_WORKERS:-64}"
SYMLINK_WORKERS="${COPYIT_SYMLINK_WORKERS:-64}"

# Parse the command-line flags. Each flag only records state in a variable
# (SKIPWOS, SKIPHL, PRESCAN, QUIET, JOBNAME, WINDOWS_ARGS); the code further
# down acts on those variables. Anything getopts reports as '?' prints the
# usage text, which exits.
while getopts '?hbpqdwj:' OPTION; do
    case "${OPTION}" in
        b) SKIPWOS=1 ;;
        h) SKIPHL=1 ;;
        p) PRESCAN=1 ;;
        q) QUIET=1 ;;
        d) : ;;  # deprecated/compatibility: accepted and ignored
        j) JOBNAME=$OPTARG ;;
        w) WINDOWS_ARGS="--check-security-info" ;;
        ?) usage ;;
        *)
            echo "invalid option $1"
            usage
            ;;
    esac
done

# Drop the parsed flags so that only the positional arguments remain.
shift "$((OPTIND - 1))"

# Exactly two positional arguments (source and destination) are required.
[ $# -eq 2 ] || usage

# Split each VOLUME:PATH argument at its first colon into a volume name and a
# path. Parameter expansion is used instead of `echo | cut` so no subprocesses
# are spawned and a value that looks like an echo option (e.g. "-n") is kept
# intact. An argument with no colon yields the whole argument for both parts;
# a path that itself contains colons is kept whole after the first one.
srcvp="$1"
dstvp="$2"
srcvol="${srcvp%%:*}"   # everything before the first ':'
srcpath="${srcvp#*:}"   # everything after the first ':'
dstvol="${dstvp%%:*}"
dstpath="${dstvp#*:}"

# Ask Starfish for the type of the source volume. A failed lookup is reported
# as a missing volume and ends the run with status 1.
if ! vtype=$(sf volume show "${srcvol}" --format type); then
    echo "source volume not found. check spelling for '$srcvp'"
    exit 1
fi

# A volume reported as "Windows" gets the security-info flag automatically,
# whether or not -w was given.
case "${vtype}" in
    Windows) WINDOWS_ARGS="--check-security-info" ;;
esac

# APPEND is text added to the end of every eval'd "sf job start" command line.
# In quiet mode (-q) it carries a stdout redirection, which only takes effect
# because those command lines are run through eval.
if [ "${QUIET:-0}" -eq 1 ]; then
    APPEND=">/dev/null"
else
    APPEND=""
fi

# Optional step (-p): start a diff scan of the source and wait for it to
# finish. Every copy job launched by this script passes --no-prescan.
if [ "${PRESCAN:-0}" -eq 1 ]; then
    printf '%s\n' "running prescan"
    sf scan start -t diff "${srcvp}" --wait
fi

# Unless -h was given, collect the distinct inode numbers of source files whose
# link count is greater than one. `links` holds them one per line and `nlinks`
# their count; both stay unset when the check is skipped.
if ! [ "${SKIPHL:-0}" -eq 0 ]; then
    echo "Skipping hard link checks"
else
    echo "Checking for hard links"
    links=$(sf query "$srcvp" --nlinks:gt 1 -H --type f --format ino | sort -u)
    nlinks=$(wc -w <<< "$links")
    echo "found $nlinks distinct hard linked items"
fi

# Turn the optional job name from -j into the option text that is spliced into
# every eval'd "sf job start" command line; empty text leaves the job name
# given on those command lines in effect.
# The name is tested for emptiness directly rather than against a sentinel
# value, so any non-empty name (including "X") is honoured. It is shell-quoted
# with printf %q because the command lines are re-parsed by eval: without the
# quoting a name containing spaces or shell metacharacters would be split or
# interpreted on that second parse.
if [ -n "${JOBNAME:-}" ]; then
    echo "setting jobname to $JOBNAME"
    JOBNAME="--job-name $(printf '%q' "$JOBNAME")"
else
    JOBNAME=""
fi

# -b: extend APPEND with a block-count filter that is passed to every job.
# NOTE(review): the filter text is "--not --blocks 1:1E", which reads as
# "NOT (blocks between 1 and 1E)"; confirm against the sf query documentation
# that this excludes, rather than selects, files with 0 blocks.
if [ "${SKIPWOS:-0}" -eq 1 ]; then
    printf '%s\n' "skipping 0 block files"
    APPEND+=" --not --blocks 1:1E"
fi

# Bold warning before the blocking phases start (each job below uses --wait).
# The escape bytes are generated by printf so the message does not depend on a
# raw ESC character being present in this file.
printf '\033[1m%s\033[0m\n' "Do not press CTRL-C"

# Every job is launched through eval because ${APPEND} may carry a
# ">/dev/null" redirection and ${JOBNAME} holds an option/value pair as text.
# eval parses its arguments a second time, so the volume paths are shell-quoted
# once here; otherwise "$", backticks, quotes, glob characters or repeated
# spaces inside a path would be expanded or altered on that second parse.
# ${JOBNAME} and ${APPEND} are passed quoted: eval joins its arguments with
# spaces and re-splits them, so their contents still become separate words.
src_q=$(printf '%q' "${srcvp}")
dst_q=$(printf '%q' "${dstvp}")

# Phase: directories (--type d).
echo "recreating directories"
eval sf job start --job-name copy --quiet --job-retry-count 0 --agent-fail-fast-threshold 95 \
    --agent-fail-fast-min-batches 50 --no-prescan "${JOBNAME}" --wait "\"copy ${WINDOWS_ARGS}\"" \
    "${src_q}" "${dst_q}" --workers-per-agent "${DIRS_WORKERS}" --type d "${APPEND}"

# Phase: files of size 1M-1P that have a single link.
echo "copying large files for throughput."
eval sf job start --job-name copy --quiet --job-retry-count 0 --agent-fail-fast-threshold 95 \
    --agent-fail-fast-min-batches 10 --no-prescan "${JOBNAME}" --wait \
    "\"copy --inplace ${WINDOWS_ARGS}\"" "${src_q}" "${dst_q}" --size 1M-1P \
    --workers-per-agent "${LARGE_FILES_WORKERS}" --nlinks 1 "${APPEND}"

# Phase: files of size 0-1M that have a single link.
echo ""
echo "copying remainder of small files without hardlinks"
eval sf job start --job-name copy --quiet --job-retry-count 0 --agent-fail-fast-threshold 98 \
    --agent-fail-fast-min-batches 10 --no-prescan "${JOBNAME}" --wait \
    "\"copy --inplace ${WINDOWS_ARGS}\"" "${src_q}" "${dst_q}" --size 0-1M \
    --workers-per-agent "${SMALL_FILES_WORKERS}" --nlinks 1 "${APPEND}"

# Phase: symbolic links (--type l).
echo "copying any remaining symlinks"
eval sf job start --job-name copy --quiet --job-retry-count 0 --agent-fail-fast-threshold 95 \
    --agent-fail-fast-min-batches 50 --no-prescan "${JOBNAME}" --wait \
    "\"copy --inplace ${WINDOWS_ARGS}\"" "${src_q}" "${dst_q}" --type l \
    --workers-per-agent "${SYMLINK_WORKERS}" "${APPEND}"

# Look up where the destination volume is mounted; the hard link step builds
# filesystem paths from it. Only the first listed mount is kept, which assumes
# the machine running this script (master) mounts the volume at the same place
# as every agent. The mounts output uses single quotes (dict-style syntax), so
# they are swapped for double quotes to let jq parse it as JSON; the quotes jq
# prints around the value are stripped at the end.
dstmount=$(
    sf volume show "${dstvol}" --format mounts \
        | tr "'" '"' \
        | jq -c '.[]' \
        | head -1 \
        | tr -d '"'
)

# Strip a leading source-path prefix from a file path.
# Arguments: $1 - prefix to remove (the path part of the source volume path)
#            $2 - path to strip the prefix from
# Outputs:   $2 with the prefix removed, on stdout; $2 unchanged when it does
#            not start with the prefix.
# The prefix is matched literally (it is quoted inside the expansion), so
# characters such as '.', '*', '[' or '|' in a directory name are not treated
# as pattern syntax.
filepart() {
    local srcpath="$1"
    local file="$2"
    printf '%s\n' "${file#"${srcpath}"}"
}
# From here on unquoted expansions split on newlines only: ${links} holds one
# inode number per line. IFS is exported as well, so child processes inherit it.
export IFS=$'\n'

if [ "${nlinks:-0}" -ne 0 ]; then
    echo ""
    echo "reifying hard links (this may take seconds per link to copy data. Standby)"
    echo "note: this cannot remove any hard link from target that was removed from source."

    # Unpredictable list file instead of a PID-based name; the EXIT trap removes
    # it even when a failing command aborts the script under `set -e`.
    linkrun=$(mktemp /tmp/linkrun.txt.XXXXXX)
    trap 'rm -f -- "${linkrun}"' EXIT
    # mktemp creates the file owner-only; keep it readable by others in case
    # sf reads --from-file under a different account (TODO confirm).
    chmod a+r "${linkrun}"

    # First pass: for every hard-linked inode pick one path (the first when
    # sorted by _id; the second pass relies on the same ordering) and write it,
    # relative to the source path, to the list file for one bulk copy.
    # shellcheck disable=SC2086  # ${links} is split on newlines on purpose
    for inode in ${links}; do
        master=$(sf query -H "${srcvp}" --type f --inode "$inode" --format full_path --sort-by _id --limit 1)
        if [ -z "${master}" ]; then
            # Nothing returned for this inode: do not emit an empty list entry.
            echo "warning: no path found for inode ${inode}; skipping" >&2
            continue
        fi
        dst_file_part=$(filepart "${srcpath}" "$master")
        # skip leading /
        printf '%s\n' "${dst_file_part#/}"
    done > "${linkrun}"

    # The job command line is re-parsed by eval, so the volume paths are
    # shell-quoted first to keep "$", quotes or spaces in them literal.
    link_src_q=$(printf '%q' "${srcvp}")
    link_dst_q=$(printf '%q' "${dstvp}")

    # use a modest number of workers per agent
    echo batch copying first entry of every link group
    eval sf job start --job-name copy --quiet --agent-fail-fast-threshold 95 --agent-fail-fast-min-batches 20 \
        --no-prescan --from-file "${linkrun}" "${JOBNAME}" --wait "\"copy ${WINDOWS_ARGS}\"" \
        "${link_src_q}" "${link_dst_q}" --workers-per-agent "${HARDLINK_WORKERS}" "${APPEND}"
    rm -f -- "${linkrun}"
    trap - EXIT

    # Second pass: link every other path of each inode to the copy made above.
    # shellcheck disable=SC2086  # ${links} is split on newlines on purpose
    for inode in ${links}; do
        mapfile -t arr < <(sf query -H "${srcvp}" --type f --inode "$inode" --format full_path --sort-by _id)
        # Fewer than two paths means there is nothing to link; this also keeps
        # an empty result from tripping `set -u` on ${arr[0]}.
        if [ "${#arr[@]}" -lt 2 ]; then
            continue
        fi
        # Without a mount point the paths below would be resolved from "/".
        if [ -z "${dstmount}" ]; then
            echo "error: no mount point found for destination volume '${dstvol}'; cannot create hard links" >&2
            exit 1
        fi
        # The destination path does not have to match the source path: each
        # target is the destination mount and path plus the file's path
        # relative to the source path.
        dst_file_part=$(filepart "${srcpath}" "${arr[0]}")
        link1="${dstmount}/${dstpath}/${dst_file_part}"

        for member in "${arr[@]:1}"; do
            dst_file_part=$(filepart "${srcpath}" "${member}")
            target="${dstmount}/${dstpath}/${dst_file_part}"
            # An existing regular file at the target is left untouched.
            if [ ! -f "${target}" ]; then
                ln "${link1}" "${target}"
            fi
        done
    done
fi


echo ""
echo "Done"
