#!/opt/starfish/examples/venv/bin/python3
"""
***********************************************************************************************************

 Starfish Storage Corporation ("Starfish") CONFIDENTIAL
 Unpublished Copyright (c) 2011 - present Starfish Storage Corporation, All Rights Reserved.

 NOTICE: This file and its contents (1) constitute Starfish's "External Code" under Starfish's most-recent
 Limited Software End-User License Agreement, and (2) is and remains the property of Starfish. The
 intellectual and technical concepts contained herein are proprietary to Starfish and may be covered by
 U.S. and/or foreign patents or patents in process, and are protected by trade secret or copyright law.
 Dissemination of this information or reproduction of this material is strictly forbidden unless prior
 written permission is obtained from Starfish. Access to the source code contained herein is hereby
 forbidden to anyone except (A) current Starfish employees, managers, or contractors who have executed
 confidentiality or nondisclosure agreements explicitly covering such access, and (B) licensees of
 Starfish's software.

 ANY REPRODUCTION, COPYING, MODIFICATION, DISTRIBUTION, PUBLIC PERFORMANCE, OR PUBLIC DISPLAY OF OR
 THROUGH USE OF THIS SOURCE CODE WITHOUT THE EXPRESS WRITTEN CONSENT OF STARFISH IS STRICTLY PROHIBITED
 AND IS IN VIOLATION OF APPLICABLE LAWS AND INTERNATIONAL TREATIES. THE RECEIPT OR POSSESSION OF THIS
 FILE OR ITS CONTENTS AND/OR RELATED INFORMATION DOES NOT CONVEY OR IMPLY ANY RIGHTS TO REPRODUCE,
 DISCLOSE, OR DISTRIBUTE ITS CONTENTS, OR TO MANUFACTURE, USE, OR SELL ANYTHING THAT IT MAY DESCRIBE, IN
 WHOLE OR IN PART.

 FOR U.S. GOVERNMENT CUSTOMERS REGARDING THIS DOCUMENTATION/SOFTWARE
   These notices shall be marked on any reproduction of this data, in whole or in part.
   NOTICE: Notwithstanding any other lease or license that may pertain to, or accompany the delivery of,
   this computer software, the rights of the Government regarding its use, reproduction and disclosure are
   as set forth in Section 52.227-19 of the FARS Computer Software-Restricted Rights clause.
   RESTRICTED RIGHTS NOTICE: Use, duplication, or disclosure by the Government is subject to the
   restrictions as set forth in subparagraph (c)(1)(ii) of the Rights in Technical Data and Computer
   Software clause at DFARS 52.227-7013.

***********************************************************************************************************
version 1.0.3
    complete refactor of gzip/grep handling
    use threads for efficiency
    complete refactor of loops and functions

version 1.0.2
    Add support for multiple NFS mounted agent dirs; refactor log file scanning globs

version 1.0.1
    Add support for upload/download/restore
"""
import argparse
import glob
import gzip
import json
import re
import sys
from argparse import RawTextHelpFormatter
from multiprocessing.pool import ThreadPool


def gzdecomp(local_fname):
    """Decompress a gzip file and return its whole contents.

    Reading is best-effort: when the file cannot be opened or decompressed a
    message is written to stderr and an empty buffer is returned, so one bad
    log file does not abort the whole scan.

    :param local_fname: path of the gzip-compressed file to read
    :returns: decompressed buffer (bytes); ``b""`` if the file is unreadable
    """
    try:
        # the context manager closes the handle even if read() fails part-way
        with gzip.open(local_fname) as gz_handle:
            return gz_handle.read()
    except Exception:
        # deliberately broad: missing, truncated and corrupt archives are all
        # reported the same way and skipped
        print(
            f"gzip file ({local_fname}) is unreadable or corrupt. Skipping.",
            file=sys.stderr,
        )
        return b""


def filter_json(in_json_str):
    """Pull the ``local_path`` value out of every entry of a JSON job input list.

    :param in_json_str: JSON text (e.g. upload/download job input) encoding a
        list of objects that each carry a ``local_path`` key
    :returns: list of the ``local_path`` values, in input order
    """
    entries = json.loads(in_json_str)
    if args.debug > 2:
        print(f"input list is: {entries}")

    paths = []
    for entry in entries:
        if args.debug > 1:
            print(f"list el is {entry}")
        paths.append(entry["local_path"])
    return paths


def get_files(databuf):
    """Split the decoded contents of a job input/output file into paths.

    Three layouts are recognised:

    * NUL separated paths: the buffer is split on NUL and returned as-is
    * a JSON list of objects: the ``local_path`` of every object is returned
      (see filter_json)
    * a single bare path: returned as a one-element list

    :param databuf: decoded text of the file
    :returns: path list (empty for an empty buffer or an empty JSON list)
    """
    split_buf = databuf.split("\0")
    if args.debug:
        print("split_buf length is " + str(len(split_buf)))

    if len(split_buf) > 1:
        if args.debug > 1:
            print("Null separated: " + str(len(split_buf)))
            print("----\n" + str(split_buf) + "\n----")
        return split_buf

    # str.split() always yields at least one element, so exactly one is left here
    payload = split_buf[0]
    if not payload:
        if args.debug > 1:
            print("return special empty set")
        return []

    # is it json inside? filter_json expects a JSON list, so only "[" counts;
    # an empty list simply produces no paths
    if payload.startswith("["):
        return filter_json(payload)

    # a lone path with no separator: keep it as a list so that callers building
    # a set() get one path rather than the characters of a string
    if args.debug:
        print("sb:" + str(split_buf))
    return split_buf


def parse_log_file(fname, updownjob):
    """Parse a compressed or plain job input/output file into a set of paths.

    Files whose name ends in ``.gz`` are decompressed with gzdecomp; anything
    else is read as-is. The contents are decoded as UTF-8.

    * upload/download/restore jobs: the buffer is handed to get_files
    * other jobs: the buffer is NUL separated; an ``.out`` / ``.out.gz`` file is
      laid out as fname\\0result\\0fname\\0result\\0 so only every other field
      is kept

    :param fname: path of the ``.in`` / ``.out`` file (optionally ``.gz``)
    :param updownjob: truthy when the job is an upload/download/restore job
    :returns: a set of items; the empty string left behind by a trailing NUL
        is never included
    """
    if args.debug:
        print("parsing file: " + fname)

    # decide by the file name's suffix only, so that a directory called e.g.
    # "logs.gz.old" does not make plain files look compressed
    is_gz = fname.endswith(".gz")
    is_outfile = fname.endswith((".out", ".out.gz"))

    if is_gz:
        if args.debug > 0:
            print("  gz file")
        raw = gzdecomp(fname)
    else:
        with open(fname, "rb") as log_handle:
            raw = log_handle.read()
    databuf = raw.decode("UTF-8")

    if updownjob:
        out_results = get_files(databuf)
    else:
        if args.debug > 1:
            print("not upload/download so result set is normal text")
        out_results = databuf.split("\0")
        if is_outfile:
            # outfile is fname\0result\0fname\0result\0
            if args.debug > 1:
                print("picking every other")
            out_results = out_results[::2]

    if args.debug > 1:
        print("\n".join(out_results))

    entries = set(out_results)
    # the trailing NUL terminator yields an empty field, which is not a path
    entries.discard("")
    return entries


def check_preverification(flist):
    """Print the pre-verification mismatch messages found in job log files.

    Every file is read line by line; for each line holding a pre-verification
    mtime/ctime message, the message text (without the surrounding quoting) is
    printed.

    :param flist: paths of job stderr / log files to scan
    :returns: nothing
    """
    mismatch = re.compile(r'pre-verification: ([mc]time changed from \d+ to \d+ for [^"]+)"],')
    for path in flist:
        with open(path, "r") as handle:
            if args.debug:
                print(f"reading stderr file: {path}")
            # search each line lazily and keep only the lines that matched
            for hit in filter(None, map(mismatch.search, handle)):
                print(hit.group(1))


def check_glob_logs(jobdirs, pool):
    """Check every matching job directory for pre-verification mismatches and
    for paths present in the job input but absent from the job output.

    :param jobdirs: list of job log directories (typically the result of a glob)
    :param pool: thread pool handed on to find_differences
    :returns: nothing
    """
    # -rsync files are filtered out of the in/out lists
    # could also be --rsync-0.in and --rsync-1.in, etc.
    norsync = re.compile(r"-rsync(-\d+)?\.(in|out)")

    for jdir in jobdirs:
        print(f"checking logs for jobdir {jdir}")

        upload_download_job = 0
        if re.search(r"(_download_|_upload_|_restore_)", jdir):
            if args.debug > 1:
                print("upload/download/restore job")
            upload_download_job = 1

        # now get list of in and out files (possibly with .gz)
        # glob returns matches in arbitrary order; sort so that the two lists
        # line up by name and the report order is reproducible
        infiles = sorted(glob.glob(f"{jdir}/cmd_executor-*/*.in*"))
        outfiles = sorted(glob.glob(f"{jdir}/cmd_executor-*/*.out*"))

        errfiles = sorted(glob.glob(f"{jdir}/*.stderr"))
        logfiles = sorted(glob.glob(f"{jdir}/cmd_executor-*.log"))

        if args.debug > 1:
            print("infiles and outfiles")
            print(infiles)
            print("---")
            print(outfiles)

        check_preverification(errfiles)
        check_preverification(logfiles)

        infiles = [x for x in infiles if not norsync.search(x)]
        outfiles = [x for x in outfiles if not norsync.search(x)]
        if args.debug:
            print("processing log files input / output")
            print(infiles)
            print(outfiles)
        if not infiles and not outfiles:
            print("job used argv passing mechanism. no in/out files to compare.")

        find_differences(infiles, outfiles, upload_download_job, pool)


def _log_stem(path):
    """Return path without its .in/.out (optionally .gz) suffix, so that the
    input and output file of one batch share the same key."""
    return re.sub(r"\.(?:in|out)(?:\.gz)?$", "", path)


def find_differences(infiles, outfiles, upload_download_job, pool):
    """Find paths listed in the job input files but not in the job output files
    and print how many there are (and, with --showfiles, which ones).

    Input and output files are paired by name (same path up to the
    ``.in``/``.out`` suffix) and each pair is handed to check_differences on the
    pool. A file without a partner is parsed on its own with parse_log_file, so
    a missing output file makes its input paths show up as differences rather
    than raising an error.

    :param infiles: list of job input files
    :param outfiles: list of job output files
    :param upload_download_job: truthy for upload/download/restore jobs
    :param pool: pool providing ``apply_async``
    :returns: None
    """
    # first output file seen for each stem
    outfile_by_stem = {}
    for path in outfiles:
        outfile_by_stem.setdefault(_log_stem(path), path)

    pending = []  # (role, async result); role is "pair", "in" or "out"
    paired_out = set()
    for infile in infiles:
        outfile = outfile_by_stem.get(_log_stem(infile))
        if outfile is None or outfile in paired_out:
            # no (unused) output file for this input file
            pending.append(("in", pool.apply_async(parse_log_file, (infile, upload_download_job))))
        else:
            paired_out.add(outfile)
            pending.append(
                ("pair", pool.apply_async(check_differences, (infile, outfile, upload_download_job)))
            )
    for outfile in outfiles:
        if outfile not in paired_out:
            pending.append(("out", pool.apply_async(parse_log_file, (outfile, upload_download_job))))

    totalinset = set()
    totaloutset = set()
    for role, async_result in pending:
        result = async_result.get()
        if role == "pair":
            in_els, out_els = result
            totalinset.update(in_els)
            totaloutset.update(out_els)
        elif role == "in":
            totalinset.update(result)
        else:
            totaloutset.update(result)

    # shorten for very large sets
    diffset = totalinset.intersection(totaloutset)
    if diffset:
        if args.debug:
            print("subtracting differences: " + str(diffset))
        totalinset = totalinset - diffset
        totaloutset = totaloutset - diffset

    if args.debug:
        print("totalin:" + str(totalinset))
        print("totalout:" + str(totaloutset))

    difflist = totalinset - totaloutset
    # an empty string comes from a trailing NUL terminator, it is not a path
    difflist.discard("")
    count = len(difflist)
    print(str(count) + " paths found in job input but not output")

    if args.showfiles and count != 0:
        for file in sorted(difflist):
            print(file)


def check_differences(infile, outfile, upload_download_job):
    """Parse one input/output file pair and, with --showfiles, print the error
    line recorded for each path that is in the input but not in the output.

    The error lines are looked up in the ``.stderr`` (or ``.stderr.gz``) file
    that sits next to the input file; the first line starting with ``ERR`` that
    mentions the path is printed.

    :param infile: job input file (``.in`` or ``.in.gz``)
    :param outfile: job output file belonging to infile
    :param upload_download_job: truthy for upload/download/restore jobs
    :returns: tuple (set of input items, set of output items)
    """
    inset = parse_log_file(infile, upload_download_job)
    outset = parse_log_file(outfile, upload_download_job)

    if not args.showfiles:
        return inset, outset

    stem_match = re.search(r"(.*)\.(in$|in\.gz$)", infile)
    if not stem_match:
        print("no match for job stderr file. skipping.")
        return inset, outset

    if args.debug > 0:
        print(f"decompressing {infile}")
    errfile = stem_match.group(1) + ".stderr"
    if args.debug > 1:
        print("looking for " + errfile)

    # escape the path so glob metacharacters in it are matched literally
    if glob.glob(glob.escape(errfile + ".gz")):
        errfile += ".gz"
        if args.debug:
            print("uncompressing stderr file " + errfile)
        databuf = gzdecomp(errfile).decode("UTF-8", errors="replace")
    else:
        if args.debug:
            print("reading normal error file: " + errfile)
        try:
            # decode like the compressed variant; undecodable bytes in the
            # command's stderr must not abort the report
            with open(errfile, "r", encoding="UTF-8", errors="replace") as err_handle:
                databuf = err_handle.read()
        except FileNotFoundError:
            databuf = ""

    # sorted so that the report order is reproducible
    for file in sorted(inset - outset):
        if not file:
            # an empty path would turn the pattern into "any ERR line"
            continue
        if args.debug > 1:
            print("file is " + file)
        matchline = r"(^ERR.*" + re.escape(file) + r".*)$"
        if args.debug > 1:
            print("searching for " + matchline)
        err_match = re.search(matchline, databuf, flags=re.MULTILINE)
        if err_match:
            print(err_match.group(1))
    return inset, outset


# ---------------------------------------------------------------------------------------------------


if __name__ == "__main__":
    # NOTE: `args` must stay a module-level name; the functions above read it.
    epilog = """\nusage:\n/opt/starfish/bin/examples/agent/jobfail.py --job 79
        Note: this is functionally obsolete by sf job issues, but included
        for example purposes"""

    parser = argparse.ArgumentParser(
        description="jobfail.py will find and show some of the "
        "more common errors in jobs. "
        "It does not yet have the capability to "
        "definitively find all possible errors. "
        "use the short form job id with -j.",
        epilog=epilog,
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument(
        "--agent-logs",
        required=False,
        help="Glob-style location for agent log directories",
    )
    parser.add_argument("--job", "-j", required=True, help="Starfish jobid")
    parser.add_argument(
        "--showfiles",
        "-s",
        action="store_true",
        required=False,
        help="Show files in input not present in output",
    )
    parser.add_argument("--debug", "-d", default=0, type=int, required=False, help="enable debugging")
    args = parser.parse_args()

    job = args.job
    globlog = glob.glob(f"/opt/starfish/log/jobs/cmd_executor-j_*_{job}_0")
    if len(globlog) > 1:
        print(
            "That's weird, I see more than one possible match on that jobid. "
            "That shouldn't happen. Exiting. Please notify support."
        )
        print(str(globlog))
        sys.exit(1)

    if len(globlog) == 0:
        print("No matching job log directory. Aborting")
        sys.exit(1)

    if args.agent_logs:
        jobdirs = glob.glob(args.agent_logs + f"/jobs/cmd_executor-j_*_{job}_0")
        if args.debug:
            print(f"found {len(jobdirs)} matching agent log dirs")
    else:
        # reuse the directory validated above instead of globbing a second
        # time with a different pattern
        jobdirs = globlog

    # the pool is only created once the arguments are known to be usable, and
    # the context manager shuts its worker threads down when the scan is over
    with ThreadPool(processes=32) as pool:
        check_glob_logs(jobdirs, pool)
