#!/bin/bash
#***********************************************************************************************************
#
# Starfish Storage Corporation ("Starfish") CONFIDENTIAL
# Unpublished Copyright (c) 2011 - present Starfish Storage Corporation, All Rights Reserved.
#
# NOTICE: This file and its contents (1) constitute Starfish's "External Code" under Starfish's most-recent
# Limited Software End-User License Agreement, and (2) is and remains the property of Starfish. The
# intellectual and technical concepts contained herein are proprietary to Starfish and may be covered by
# U.S. and/or foreign patents or patents in process, and are protected by trade secret or copyright law.
# Dissemination of this information or reproduction of this material is strictly forbidden unless prior
# written permission is obtained from Starfish. Access to the source code contained herein is hereby
# forbidden to anyone except (A) current Starfish employees, managers, or contractors who have executed
# confidentiality or nondisclosure agreements explicitly covering such access, and (B) licensees of
# Starfish's software.
#
# ANY REPRODUCTION, COPYING, MODIFICATION, DISTRIBUTION, PUBLIC PERFORMANCE, OR PUBLIC DISPLAY OF OR
# THROUGH USE OF THIS SOURCE CODE WITHOUT THE EXPRESS WRITTEN CONSENT OF STARFISH IS STRICTLY PROHIBITED
# AND IS IN VIOLATION OF APPLICABLE LAWS AND INTERNATIONAL TREATIES. THE RECEIPT OR POSSESSION OF THIS
# FILE OR ITS CONTENTS AND/OR RELATED INFORMATION DOES NOT CONVEY OR IMPLY ANY RIGHTS TO REPRODUCE,
# DISCLOSE, OR DISTRIBUTE ITS CONTENTS, OR TO MANUFACTURE, USE, OR SELL ANYTHING THAT IT MAY DESCRIBE, IN
# WHOLE OR IN PART.
#
# FOR U.S. GOVERNMENT CUSTOMERS REGARDING THIS DOCUMENTATION/SOFTWARE
#   These notices shall be marked on any reproduction of this data, in whole or in part.
#   NOTICE: Notwithstanding any other lease or license that may pertain to, or accompany the delivery of,
#   this computer software, the rights of the Government regarding its use, reproduction and disclosure are
#   as set forth in Section 52.227-19 of the FARS Computer Software-Restricted Rights clause.
#   RESTRICTED RIGHTS NOTICE: Use, duplication, or disclosure by the Government is subject to the
#   restrictions as set forth in subparagraph (c)(1)(ii) of the Rights in Technical Data and Computer
#   Software clause at DFARS 52.227-7013.
#
#***********************************************************************************************************

set -euo pipefail

PROG="$0"

# Print an error message on stderr and terminate the script.
# Arguments: the message words; they are joined into one line using the first
#            character of IFS (a space by default).
# Outputs:   the message, followed by a newline, on stderr.
# Returns:   never returns; exits the shell with status 1.
failure() {
    # printf instead of echo: echo would interpret a message such as "-n" or "-e"
    # as one of its own options and print nothing.
    printf '%s\n' "$*" >&2
    exit 1
}

# Print the command-line synopsis and a one-line description on stdout.
# Globals: PROG (read) - name under which the script was invoked.
usage() {
    printf '    %s <SCAN_ID>\n\n    Show failed entries for given scan.\n' "${PROG}"
}


# Exactly one argument is accepted: either a help flag or the scan id.
# A wrong argument count is a usage error, so the usage text is written to stderr
# (stdout stays clean for anything consuming the report) and the exit status is 1.
if [[ "$#" -ne 1 ]]; then
    usage >&2
    exit 1
fi

# Explicit help request: print the usage text on stdout and exit successfully.
case "$1" in
    -h|--help)
        usage
        exit 0
        ;;
esac

# Scan id as given in the first command-line argument.
SCAN_ID="$1"
# Log root: "log" below SFHOME, or below /opt/starfish when SFHOME is unset or empty.
SF_LOG_DIR="${SFHOME:-/opt/starfish}/log"
# Directory searched for the crawler logs of this scan.
SCAN_LOG_DIR="${SF_LOG_DIR}/jobs/crawler-${SCAN_ID}"
# None of the three may be reassigned later in the script.
readonly SCAN_ID SF_LOG_DIR SCAN_LOG_DIR

# Below, zgrep is used to unpack the log files and pre-filter them by a fixed string
# (the -F option should be quite fast).
# To avoid printing every line that merely contains a substring such as 'Broken name' or the scan_id,
# sed is run with the --silent option and prints only the lines that matched its expression (p command).

# Print one rewritten line for every log line that matches a known error format.
# Arguments:
#   $1     - fixed string zgrep uses to pre-filter the log lines
#   $2     - sed program (extended regex); only lines it rewrites are printed
#   $3...  - log files to search
# Outputs:   the rewritten lines on stdout
# Returns:   always 0
print_matching_errors() {
    local needle="$1"
    local sed_program="$2"
    shift 2
    # zgrep exits non-zero when nothing matches, which is an expected outcome here;
    # together with `set -o pipefail` and `set -e` that would abort the script,
    # so the failure of the pipeline is deliberately ignored.
    zgrep -F "${needle}" "$@" | sed --silent --regexp-extended "${sed_program}" || true
}

if [[ -d "${SCAN_LOG_DIR}" ]]; then
    echo "Looking for crawler errors in ${SCAN_LOG_DIR}" >&2

    # Expand the glob once. When nothing matches, bash leaves the pattern unchanged, so the
    # only array element is then a path that does not exist; detect that here instead of
    # letting every zgrep call below fail on the literal pattern.
    err_logs=("${SCAN_LOG_DIR}"/*.err*)
    if [[ ! -e "${err_logs[0]}" && ! -L "${err_logs[0]}" ]]; then
        echo "No crawler error logs (*.err*) found in ${SCAN_LOG_DIR}" >&2
    else
        # those are patterns for older scans, before unification of errors
        print_matching_errors 'Broken name' \
            "s/.*Broken name, not UTF-8 encoded b?[\"'](.*)[\"']: 'utf-?8' codec can't .*/non-UTF-8: \\1/pg" \
            "${err_logs[@]}"
        print_matching_errors 'empty filename(s) in directory' \
            "s/.*Found [[:digit:]]+ empty filename\\(s\\) in directory b?[\"'](.*)[\"'].*/directory with empty filenames: \\1/pg" \
            "${err_logs[@]}"
        print_matching_errors '[Errno' \
            "s/.*\\[Errno [[:digit:]]+\\] (.*): '(.*)' \\[.+:[[:digit:]]+\\]$/\\1: \\2/pg" \
            "${err_logs[@]}"
        # 'contains more than' appears in two different messages; each gets its own pass so
        # that all [huge-dir] lines are printed before the [ignored-huge-dir] lines.
        print_matching_errors 'contains more than' \
            "s/.*Directory '(.*)' contains more than [[:digit:]]+ entries: ([[:digit:]]+).*$/[huge-dir] \\1 contains \\2 items/pg" \
            "${err_logs[@]}"
        print_matching_errors 'contains more than' \
            "s/.*Ignoring directory '(.*)', because it contains more than ([[:digit:]]+) entries.*$/[ignored-huge-dir] \\1 contains more than \\2 items/pg" \
            "${err_logs[@]}"

        # patterns for new scans with errors unified
        print_matching_errors 'Crawler error' \
            "s/.*Crawler error: (.*) \\[.*:[[:digit:]]+\\]/\\1/pg" \
            "${err_logs[@]}"
    fi
else
    echo "Crawler log dir ${SCAN_LOG_DIR} not found. Run $0 on agent machine to find non-UTF-8 files and other errors" >&2
    exit 1
fi
