#!/opt/starfish/examples/venv/bin/python3
"""
***********************************************************************************************************

 Starfish Storage Corporation ("Starfish") CONFIDENTIAL
 Unpublished Copyright (c) 2011 - present Starfish Storage Corporation, All Rights Reserved.

 NOTICE: This file and its contents (1) constitute Starfish's "External Code" under Starfish's most-recent
 Limited Software End-User License Agreement, and (2) is and remains the property of Starfish. The
 intellectual and technical concepts contained herein are proprietary to Starfish and may be covered by
 U.S. and/or foreign patents or patents in process, and are protected by trade secret or copyright law.
 Dissemination of this information or reproduction of this material is strictly forbidden unless prior
 written permission is obtained from Starfish. Access to the source code contained herein is hereby
 forbidden to anyone except (A) current Starfish employees, managers, or contractors who have executed
 confidentiality or nondisclosure agreements explicitly covering such access, and (B) licensees of
 Starfish's software.

 ANY REPRODUCTION, COPYING, MODIFICATION, DISTRIBUTION, PUBLIC PERFORMANCE, OR PUBLIC DISPLAY OF OR
 THROUGH USE OF THIS SOURCE CODE WITHOUT THE EXPRESS WRITTEN CONSENT OF STARFISH IS STRICTLY PROHIBITED
 AND IS IN VIOLATION OF APPLICABLE LAWS AND INTERNATIONAL TREATIES. THE RECEIPT OR POSSESSION OF THIS
 FILE OR ITS CONTENTS AND/OR RELATED INFORMATION DOES NOT CONVEY OR IMPLY ANY RIGHTS TO REPRODUCE,
 DISCLOSE, OR DISTRIBUTE ITS CONTENTS, OR TO MANUFACTURE, USE, OR SELL ANYTHING THAT IT MAY DESCRIBE, IN
 WHOLE OR IN PART.

 FOR U.S. GOVERNMENT CUSTOMERS REGARDING THIS DOCUMENTATION/SOFTWARE
   These notices shall be marked on any reproduction of this data, in whole or in part.
   NOTICE: Notwithstanding any other lease or license that may pertain to, or accompany the delivery of,
   this computer software, the rights of the Government regarding its use, reproduction and disclosure are
   as set forth in Section 52.227-19 of the FARS Computer Software-Restricted Rights clause.
   RESTRICTED RIGHTS NOTICE: Use, duplication, or disclosure by the Government is subject to the
   restrictions as set forth in subparagraph (c)(1)(ii) of the Rights in Technical Data and Computer
   Software clause at DFARS 52.227-7013.

***********************************************************************************************************
"""
###############################################################################
#  Author Doug Hughes
#  Last modified 2021-12-03
#
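# Find candidate duplicate files on a Starfish volume: files larger than a
# size cutoff that share their size with at least one other file. The list is
# handed to an "sf job" that runs hashsnap.py on them, optionally through
# snapshot paths. The database URI is read from the Starfish config file,
# removing the need to find the auth key by hand.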
#
# 2018-08-27 - add html table support
# 2021-10-10 - detect non-existent volume
# 2021-12-03 - add example (dg)
# 2023-03-15 - use tempfile module
# 2023-04-26 - Use os.execvp to run "sf job"
#
# WARNING: This script runs queries directly against the Starfish database
# and could be used to cause tremendous harm and render Starfish inoperable if
# used incorrectly. It is recommended to only run select and not update queries.

import os
import subprocess
import sys
import tempfile
import unittest
from subprocess import DEVNULL

# These modules are imported under a guard so that a missing dependency
# produces an actionable message instead of a bare traceback.
try:
    import argparse
    import configparser
    import shlex

    import psycopg2
except ImportError:
    # Adjacent string literals are concatenated verbatim, so every fragment
    # must carry its own separating space.
    print(
        "In order to use this, you must have configparser, argparse, shlex "
        "and psycopg2 modules installed (available via pip among "
        "other means.)"
    )
    sys.exit(1)


def _get_sf_path(*path_parts):
    """Join *path_parts* onto the Starfish install root.

    The root is taken from the ``SFHOME`` environment variable and falls back
    to ``/opt/starfish/`` when that variable is not set.
    """
    sf_home = os.environ.get("SFHOME", "/opt/starfish/")
    return os.path.join(sf_home, *path_parts)


class TestQ(unittest.TestCase):
    """Smoke tests for database connectivity (run through the --test flag)."""

    def _connect(self):
        """Open a connection that is closed automatically when the test ends."""
        conn = psycopg2.connect(getpgauth())  # pylint: disable=W0621
        # Register the close up front so the connection is released even when
        # a later assertion in the test fails.
        self.addCleanup(conn.close)
        return conn

    def test_auth(self):
        """test connection"""
        self.assertIsNotNone(self._connect())

    def test_query(self):
        """test sql return"""
        conn = self._connect()  # pylint: disable=W0621
        cur = conn.cursor()  # pylint: disable=W0621
        # Cleanups run in reverse order, so the cursor closes before the connection.
        self.addCleanup(cur.close)
        cur.execute("""select count(*) from sf_volumes.volume""")
        self.assertIsNotNone(cur)
        rows = cur.fetchall()  # pylint: disable=W0621
        # count(*) without GROUP BY always yields exactly one row.
        self.assertEqual(len(rows), 1)


def check_valid_volume(volume):
    """Return True when ``sf volume show <volume>`` exits with status 0.

    The command's stdout and stderr are discarded; only its exit status is
    inspected.
    """
    show_cmd = ["sf", "volume", "show", volume]
    completed = subprocess.run(
        show_cmd,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        check=False,
    )
    return not completed.returncode


def getpgauth():
    """Return the PostgreSQL connection URI from the Starfish config file.

    Reads the ``pg_uri`` option of the ``[pg]`` section in
    ``etc/99-local.ini`` under the Starfish install root.

    Returns:
        The value of ``pg_uri`` as a string.

    Exits the process with status 1 (after printing a message) when the file
    cannot be read or parsed, or when the section/option is missing.
    """
    cfg_file = _get_sf_path("etc/99-local.ini")
    config = configparser.ConfigParser()
    try:
        # ConfigParser.read() skips files it cannot open instead of raising,
        # and returns the list of files it actually parsed; an empty list
        # therefore means the config file was missing or unreadable.
        if not config.read(cfg_file):
            print("can't read config file to get connection uri. check permissions.")
            sys.exit(1)
        return config.get(
            "pg",
            "pg_uri",
        )
    except OSError:
        print("can't read config file to get connection uri. check permissions.")
        sys.exit(1)
    except configparser.Error as err:
        # Covers malformed files as well as a missing [pg] section or pg_uri option.
        print(f"can't get pg_uri from the [pg] section of {cfg_file}: {err}")
        sys.exit(1)


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------


def get_connection():
    """Open a psycopg2 connection using the URI from the Starfish config file.

    Prints a message and exits with status 1 when the config file is not
    readable by the current user or when psycopg2 reports a database error
    while connecting.
    """
    ini_path = _get_sf_path("etc/99-local.ini")
    readable = os.access(ini_path, os.R_OK)
    if not readable:
        print(f"no permissions to get the sql auth token from {ini_path}. Try as root?")
        sys.exit(1)

    try:
        connection = psycopg2.connect(getpgauth())  # pylint: disable=W0621
    except psycopg2.DatabaseError as err:
        print("unable to connect to the database: {}".format(str(err)))
        sys.exit(1)
    else:
        return connection


# Selects the full path of every file on the volume whose size exceeds the
# cutoff and is shared with at least one other such file. The values are bound
# by psycopg2 (%(name)s placeholders) rather than interpolated into the text,
# so a volume name containing quotes cannot alter the statement.
_CANDIDATE_QUERY = """
    WITH sizes AS (SELECT f.size
    FROM sf.file_current f
    INNER JOIN sf_volumes.volume v on v.id = f.volume_id
    WHERE v.name = %(volume)s
      AND f.size > %(min_size)s
    GROUP BY f.size
    HAVING count(f.size) > 1)
    SELECT d.path || '/' || f.name
    FROM sf.dir_current d
    INNER JOIN sf.file_current f ON d.id = f.parent_id and d.volume_id = f.volume_id
    INNER JOIN sizes s ON s.size = f.size
    INNER JOIN sf_volumes.volume v on v.id = f.volume_id
    WHERE v.name = %(volume)s
      AND f.size > %(min_size)s
    """


def _build_parser():
    """Build the command-line parser for snapdup."""
    parser = argparse.ArgumentParser(
        description="""
Snapdup is a front-end, user-accessible program to enable an administrator
to run a tailored hashing operation leveraging storage snapshots.
This allows a hash to happen without updating the access times of files
which would skew aging stats.  Snapdup executes a job which calls hashsnap
on agent nodes to parallelize the job of collection hashes across many
machines. It first queries the database for active files to hash,
skipping files with unique sizes; it then substitutes the real path with
the snapshot path.  To collect hashes on all files (e.g. for auditing
or verification purposes)  use hashsnap.py instead.""",
        epilog=r"""Examples:

Hash the volume 'Isilon_Data', mounted at /mnt/ifsdata, using a top level
snapshot named in the format of Daily_<datestamp>', with the snapshot
path of /mnt/isidata/.snapshot:

  /opt/starfish/bin/examples/snapdup.py --volume Isilon_Data --snapshot-name \
  '/mnt/ifsdata/.snapshot/Schedule*' --snapshot-location top""",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    parser.add_argument(
        "--cutoffMIB", required=False, default=1, type=float, help="minimum MiB to consider when comparing"
    )
    parser.add_argument("--volume", required=True, help="volume which to check")
    parser.add_argument("--start-path", required=False, default=".", help=argparse.SUPPRESS)
    parser.add_argument("--snapshot-name", required=False, help="e.g. .snapshot|.snapshots|.zfs/snapshot")
    parser.add_argument(
        "--snapshot-location",
        required=False,
        choices=["top", "local"],
        help="is snapshot location at share level or like zfs, in every directory",
    )
    parser.add_argument("--prescan", default=False, action="store_true", help="Start scan before calculating hashes")
    parser.add_argument("--debug", action="store_true", required=False, help="add some debugging to output")
    parser.add_argument("--test", action="store_true", required=False, help=argparse.SUPPRESS)
    return parser


def _build_job_cmd(hashsnap_path, snapshot_location, snapshot_name):
    """Return the hashsnap command line passed to ``sf job start`` as one string.

    The snapshot options are shell-quoted because the whole command travels as
    a single argument.
    """
    parts = [hashsnap_path, "--paths-via-stdin"]
    if snapshot_location:
        parts += ["--snapshot-location", shlex.quote(snapshot_location)]
    if snapshot_name:
        parts += ["--snapshot-name", shlex.quote(snapshot_name)]
    return " ".join(parts)


def _write_nul_terminated(rows, out_file):
    """Write the first column of each row to *out_file*, NUL-terminated.

    *out_file* must be opened in binary mode. Returns the number of rows written.
    """
    count = 0
    for count, row in enumerate(rows, start=1):
        out_file.write(row[0].encode("utf-8") + b"\0")
    return count


def main(argv=None):
    """Find same-sized files on a volume and start an ``sf job`` hashing them.

    Args:
        argv: argument list to parse; ``None`` means ``sys.argv[1:]``.

    On success this function does not return: the process is replaced by
    ``sf job start`` via ``os.execvp``. It exits with status 1 on invalid
    arguments, an unknown volume, or when no candidate files are found.
    """
    args = _build_parser().parse_args(argv)

    # The two snapshot options only make sense together.
    if bool(args.snapshot_name) != bool(args.snapshot_location):
        print("Error: --snapshot-name and --snapshot-location must both be defined or neither.", file=sys.stderr)
        sys.exit(1)

    if args.test:
        # exit=True makes unittest terminate the process with the test result.
        unittest.main(argv=["first-arg-is-ignored"], exit=True)

    if not check_valid_volume(args.volume):
        print(f"Invalid volume name '{args.volume}'", file=sys.stderr)
        sys.exit(1)

    params = {"volume": args.volume, "min_size": args.cutoffMIB * 1024 * 1024}
    conn = get_connection()

    with tempfile.NamedTemporaryFile(prefix="stage1a.out.", mode="wb") as of1:
        try:
            cur = conn.cursor()
            print("finding pairs of similarly sized files")
            if args.debug:
                # mogrify() returns the statement exactly as it will be sent.
                print(cur.mogrify(_CANDIDATE_QUERY, params).decode("utf-8", "replace"), file=sys.stderr)
            cur.execute(_CANDIDATE_QUERY, params)
            linecount = _write_nul_terminated(cur, of1)
        finally:
            # All rows are on disk now; release the database before the long-running job.
            conn.close()
        # Push the path list out of Python's buffer so the job can read it by name.
        of1.flush()

        if linecount == 0:
            print("no pairs of files with same size match cutoff criteria. Consider lowering cutoff")
            sys.exit(1)
        print(f"found {linecount} files to scan")

        print("running hash on duplicate set")

        job_cmd = _build_job_cmd(
            _get_sf_path("bin/examples/job/hashsnap.py"),
            args.snapshot_location,
            args.snapshot_name,
        )
        sf_cmd = [
            "sf",
            "job",
            "start",
            "--wait",
            "--prescan" if args.prescan else "--no-prescan",
            "--job-name=hash",
            "--paths-via-stdin",
            "--cmd-output-format=json",
            f"--from-file0={of1.name}",
            "--type=f",  # sql produce only files, so that dispatcher does not look for files in directory table
            "--workers-per-agent=6",
            job_cmd,
            f"{args.volume}:",
        ]
        # exec replaces the process without flushing Python's stdio buffers, so
        # flush explicitly or the progress messages above are lost when the
        # output is redirected.
        sys.stdout.flush()
        sys.stderr.flush()
        # NOTE(review): after a successful exec the with-block never exits, so
        # the temp file stays on disk (the job reads it) and is not removed here.
        os.execvp(sf_cmd[0], sf_cmd)


if __name__ == "__main__":
    main()
