#!/opt/starfish/examples/venv/bin/python3
"""
***********************************************************************************************************

 Starfish Storage Corporation ("Starfish") CONFIDENTIAL
 Unpublished Copyright (c) 2011 - present Starfish Storage Corporation, All Rights Reserved.

 NOTICE: This file and its contents (1) constitute Starfish's "External Code" under Starfish's most-recent
 Limited Software End-User License Agreement, and (2) is and remains the property of Starfish. The
 intellectual and technical concepts contained herein are proprietary to Starfish and may be covered by
 U.S. and/or foreign patents or patents in process, and are protected by trade secret or copyright law.
 Dissemination of this information or reproduction of this material is strictly forbidden unless prior
 written permission is obtained from Starfish. Access to the source code contained herein is hereby
 forbidden to anyone except (A) current Starfish employees, managers, or contractors who have executed
 confidentiality or nondisclosure agreements explicitly covering such access, and (B) licensees of
 Starfish's software.

 ANY REPRODUCTION, COPYING, MODIFICATION, DISTRIBUTION, PUBLIC PERFORMANCE, OR PUBLIC DISPLAY OF OR
 THROUGH USE OF THIS SOURCE CODE WITHOUT THE EXPRESS WRITTEN CONSENT OF STARFISH IS STRICTLY PROHIBITED
 AND IS IN VIOLATION OF APPLICABLE LAWS AND INTERNATIONAL TREATIES. THE RECEIPT OR POSSESSION OF THIS
 FILE OR ITS CONTENTS AND/OR RELATED INFORMATION DOES NOT CONVEY OR IMPLY ANY RIGHTS TO REPRODUCE,
 DISCLOSE, OR DISTRIBUTE ITS CONTENTS, OR TO MANUFACTURE, USE, OR SELL ANYTHING THAT IT MAY DESCRIBE, IN
 WHOLE OR IN PART.

 FOR U.S. GOVERNMENT CUSTOMERS REGARDING THIS DOCUMENTATION/SOFTWARE
   These notices shall be marked on any reproduction of this data, in whole or in part.
   NOTICE: Notwithstanding any other lease or license that may pertain to, or accompany the delivery of,
   this computer software, the rights of the Government regarding its use, reproduction and disclosure are
   as set forth in Section 52.227-19 of the FARS Computer Software-Restricted Rights clause.
   RESTRICTED RIGHTS NOTICE: Use, duplication, or disclosure by the Government is subject to the
   restrictions as set forth in subparagraph (c)(1)(ii) of the Rights in Technical Data and Computer
   Software clause at DFARS 52.227-7013.

***********************************************************************************************************
"""

###############################################################################
#  Author Doug Hughes, Ken Carlile, Jim Lester
#  Last modified 2023-11-20
#
# Compare directory trees using SQL wrapped with a little Python
#
# WARNING: This script runs queries directly against the Starfish database
# and may use up memory or become outdated if there is a database schema change
#
# 2019-11-07: created and committed
# 2019-11-12: fix source/dest swap in build_query and reference
# 2019-11-14: add options to scan source and scan target
# 2019-12-04: cleanup flake8 and add better help note on usage
# 2020-01-19: add --verbose and --quiet flags for output
# 2020-07-15: more efficiency added by Ken Carlile
# 2021-03-25: Bug fix for when no directories to copy and minor output changes
# 2021-03-29: Fixes for handling spaces in starting directories
# 2021-04-06: better performance in query, remove double /, add --no-post-verification, add pid to tmp files
# 2021-04-14: transient severe 100X performance regression in query planner when using OR
#             also print files in verbose mode
# 2021-04-16: don't run job if file is empty
# 2021-05-26: correct issues when source is at root of volume; fix tests
# 2021-07-09: updated to use python virtual env now included with sf-examples
# 2021-07-20: add logic and test cases to determine bad source path and exit
# 2022-01-11: give larger batches for better efficiency when doing remove
# 2022-01-26: turn on autocommit to avoid idle_in_transaction timeout and enable readonly for safety
# 2022-08-18: Refactor and fix pylint warning: 'redefined-outer-name'
# 2023-06-02: use new volume to id function and ancestor_ids
# 2023-08-05: use more descriptive job name
# 2023-11-20: add capability to defer removal of things on the destination containing a tag
# version 1.7

import argparse
import configparser
import json
import os
import re
import subprocess
import sys
import tempfile
import unittest
from argparse import RawTextHelpFormatter

import psycopg2


class BadSrcPath(Exception):
    """Signal that the source volpath is not usable as a comparison source.

    Callers treat this as fatal: cleaning a target against a bad source
    would select everything on the destination for removal.
    """


##############################################################################
# Unit tests
##############################################################################


class TestQ(unittest.TestCase):
    """Integration tests for clean_target.

    These are not isolated unit tests: they talk to the live Starfish
    database and shell out to the ``sf`` CLI.  The volume based tests
    additionally need root, because the fixture registers and scans two
    throw-away volumes (``s1`` -> /tmp/s1 and ``d1`` -> /tmp/d1).
    """

    # one-time fixture bookkeeping shared by every test instance
    setup_done = False
    voltests = False
    # always show complete diffs when a list comparison fails; set on the class
    # so that it applies to every test instance, not only the first one
    maxDiff = None

    def setUp(self):
        """Build the on-disk fixture and Starfish volumes once per process."""
        if TestQ.setup_done:
            return
        if os.getuid() != 0:
            print("volume tests can only be run as root", file=sys.stderr)
            TestQ.voltests = False
        else:
            TestQ.voltests = True
            # source tree: /tmp/s1, destination tree: /tmp/d1/s2
            trymkdir("/tmp/s1")
            trymkdir("/tmp/d1/s2")
            trymkdir("/tmp/s1/sourceonlydir")
            # directories that exist only on the destination (expected in the diff)
            trymkdir("/tmp/d1/s2/destonlydir/deep1/deep2/deep3")
            trymkdir('/tmp/d1/s2/destonlydir/deep1/deep4 with;!":$hi nasties')
            trymkdir("/tmp/d1/s2/destonlydir/deep1/deep5")
            with open("/tmp/d1/s2/destonlydir/deep1/deep5/file1", "w") as f:
                print("hello", file=f)
            with open("/tmp/d1/s2/destonlydir/deep1/deep2/deep3/file2", "w") as f:
                print("hello", file=f)
            # directories and files present on both sides (must not show up in the diff)
            trymkdir("/tmp/s1/same1/same2")
            trymkdir("/tmp/s1/same1/same3")
            trymkdir("/tmp/s1/same1/same4")
            trymkdir("/tmp/d1/s2/same1/same2")
            trymkdir("/tmp/d1/s2/same1/same3")
            trymkdir("/tmp/d1/s2/same1/same4")
            with open("/tmp/d1/s2/same1/same2/file1", "w") as f:
                print("hello", file=f)
            with open("/tmp/d1/s2/same1/file2", "w") as f:
                print("hello", file=f)
            with open("/tmp/s1/same1/same2/file1", "w") as f:
                print("hello", file=f)
            with open("/tmp/s1/same1/file2", "w") as f:
                print("hello", file=f)
            # register the volumes only when Starfish does not know them yet
            if os.system("sf volume show s1 > /dev/null 2>&1 ") != 0:
                print("adding volume s1")
                os.system("sf volume add s1 /tmp/s1")
            if os.system("sf volume show d1 > /dev/null 2>&1 ") != 0:
                print("adding volume d1")
                os.system("sf volume add d1 /tmp/d1")
            print("scanning source and destination volumes")
            os.system("sf scan start --wait s1 >/dev/null 2>&1")
            os.system("sf scan start --wait d1 >/dev/null 2>&1")
            os.system("sf tag add d1:/s2/destonlydir/deep1 keep")

        TestQ.setup_done = True

    def test_auth(self):
        """test connection"""
        print("testing auth")
        conn = psycopg2.connect(Pg.getpgauth())
        # close the connection even when the assertion fails
        self.addCleanup(conn.close)
        self.assertNotEqual(conn, None)

    def test_query(self):
        """test sql return"""
        print("testing a query")
        conn = psycopg2.connect(Pg.getpgauth())
        # close the connection even when an assertion fails
        self.addCleanup(conn.close)
        query = """select count(*) from sf_volumes.volume"""
        cur = conn.cursor()
        cur.itersize = 100000
        cur.execute(query)
        self.assertNotEqual(cur, None)
        rows = cur.fetchall()
        self.assertEqual(len(rows), 1)

    def test_srcpath(self):
        """test valid source paths"""
        # NOTE(review): relies on a pre-existing "home" volume with a "doug"
        # directory, a ".bashrc" file and an empty ".emptydir" - confirm that the
        # test host provides them.
        self.assertTrue(srcpath_exists("home:doug"))
        with self.assertRaises(BadSrcPath):
            srcpath_exists("BadVol:")
        with self.assertRaises(BadSrcPath):
            print("check file")
            srcpath_exists("home:doug/.bashrc")
        with self.assertRaises(BadSrcPath):
            print("check emptydir")
            srcpath_exists("home:.emptydir")

    def test_diff(self):
        """test differences"""
        if not TestQ.voltests:
            return
        print("testing differences between source and destination")
        pgh_ = Pg()

        dquery_, fquery_ = build_query("s1", "", "d1", "s2")
        flist = pgh_.runquery(fquery_)
        dlist = pgh_.runquery(dquery_)
        # assertCountEqual is poorly named, it asserts that two lists contain the same elements regardless of order
        self.assertCountEqual(
            dlist,
            [
                "destonlydir/deep1/deep2/deep3",
                "destonlydir/deep1/deep2",
                'destonlydir/deep1/deep4 with;!":$hi nasties',
                "destonlydir/deep1/deep5",
                "destonlydir/deep1",
                "destonlydir",
            ],
        )
        self.assertCountEqual(flist, ["destonlydir/deep1/deep2/deep3/file2", "destonlydir/deep1/deep5/file1"])

    def test_exclude(self):
        """test exclusion tag"""
        # the s1/d1 fixture only exists when setUp ran as root (same guard as test_diff)
        if not TestQ.voltests:
            return
        pgh_ = Pg()
        # NOTE: the "keep" tag is not activated here (that would require
        # args.exclude_tag = "keep"), so the expected list is the full,
        # unfiltered directory diff.
        dquery_, _ = build_query("s1", "", "d1", "s2")
        dlist = pgh_.runquery(dquery_)
        self.assertCountEqual(
            dlist,
            [
                "destonlydir/deep1/deep2/deep3",
                'destonlydir/deep1/deep4 with;!":$hi nasties',
                "destonlydir/deep1/deep2",
                "destonlydir/deep1/deep5",
                "destonlydir/deep1",
                "destonlydir",
            ],
        )


##############################################################################
# Handle Postgres connections
##############################################################################


class Pg:
    """Encapsulate access to the Starfish Postgres database.

    The connection is opened read-only with autocommit enabled, so this
    class can only be used for SELECT style queries.
    """

    # Starfish configuration file holding the [pg] pg_uri connection string
    CONFIG_PATH = "/opt/starfish/etc/99-local.ini"

    def __init__(self):
        """Connect to the database; exit the process with status 1 on failure."""
        try:
            self._conn_ = psycopg2.connect(Pg.getpgauth())
            self._conn_.set_session(readonly=True, autocommit=True)
        except psycopg2.DatabaseError as e:
            print(f"unable to connect to the database: {e}", file=sys.stderr)
            sys.exit(1)

    @staticmethod
    def getpgauth():
        """Read the Postgres connection URI from the Starfish config file.

        :returns: the value of ``pg_uri`` in the ``[pg]`` section
        Exits the process with status 1 when the file is unreadable or the
        setting cannot be obtained.
        """
        try:
            if not os.access(Pg.CONFIG_PATH, os.R_OK):
                print(
                    "No access to /opt/starfish/etc/99-local.ini. You may need "
                    "to run this as Starfish user, as root, or add group read "
                    " permissions to this file.",
                    file=sys.stderr,
                )
                sys.exit(1)
            config = configparser.ConfigParser()
            config.read(Pg.CONFIG_PATH)
            return config.get("pg", "pg_uri")
        except OSError:
            print("can't read config file to get connection uri. check permissions.", file=sys.stderr)
            sys.exit(1)
        except Exception as exc:  # top-level boundary: report and exit, never traceback
            # Only the exception class is reported: configparser messages can
            # quote the offending line, which may contain the database password.
            print(
                "unable to read the pg auth information. "
                "This can only be run from the Starfish master, not an agent.",
                file=sys.stderr,
            )
            print(f"(reason: {type(exc).__name__})", file=sys.stderr)
            sys.exit(1)

    @staticmethod
    def _cli_flag(name):
        """Return the boolean CLI flag *name*, or False when no CLI args were parsed.

        The flags live in the module-level ``args`` namespace that only exists
        when the file runs as a script; looking it up defensively keeps the
        class usable when the module is imported.
        """
        return bool(getattr(globals().get("args"), name, False))

    def runquery(self, query, params=None):
        """Run a query and return the first column of every row.

        :param query: SQL text to execute
        :param params: optional sequence/mapping of values bound by the driver
            to the placeholders in *query*; when omitted the query text is
            executed exactly as given
        :returns: list with the first column of each row, in the order the
            database returned them (empty list when there are no rows)
        """
        if self._cli_flag("debug"):
            print("executing query " + query)

        cur = self._conn_.cursor()
        try:
            if params is None:
                cur.execute(query)
            else:
                cur.execute(query, params)
            rows = cur.fetchall()
        finally:
            # release the cursor even when the query fails
            cur.close()

        if not rows and not self._cli_flag("quiet"):
            print("no items returned in query stage.", file=sys.stderr)

        return [row[0] for row in rows]


##############################################################################
# Global subs
##############################################################################


def trymkdir(path):
    """Create *path* and any missing parents, best effort.

    An already existing directory is not an error, and filesystem failures
    (permissions, a file in the way, ...) are deliberately ignored so that
    test setup can proceed.  Programming errors such as passing a non-path
    argument are no longer hidden.
    :returns: nothing
    """
    try:
        os.makedirs(path, exist_ok=True)
    except OSError:
        # best effort: the caller does not care whether the directory was created
        pass


def validate_volpath(volpath):
    """Split a ``VOLUME:PATH`` string into its volume and path parts.

    The split happens at the first colon (an optional slash directly after
    it is dropped) and trailing slashes are removed from both parts.
    :returns: two element list ``[volume, path]``
    Exits the process with status 1 when *volpath* contains no colon.
    """
    pieces = re.split(r":/?", volpath, maxsplit=1)
    if len(pieces) == 2:
        return [piece.rstrip("/") for piece in pieces]

    print("Volpath must be in the form of VOLUME:PATH", file=sys.stderr)
    sys.exit(1)


def prescan(volpath):
    """Scan *volpath* with ``sf scan start --wait`` before comparing.

    :param volpath: volume and path in VOLUME:PATH form, passed to ``sf`` as is
    :returns: nothing
    Exits the process with status 1 when the scan command fails.
    """
    # use subprocess for efficiency. Contributed by kdc 6/29/2020
    response = subprocess.run(["sf", "scan", "start", "--wait", "--quiet", volpath], check=False)
    if response.returncode:
        # diagnostics belong on stderr, like every other error in this script
        print(f"Error running prescan of '{volpath}' (exit status {response.returncode})", file=sys.stderr)
        sys.exit(1)


def _sql_literal(value):
    """Return *value* as a single-quoted SQL string literal.

    Embedded single quotes are doubled so that names such as ``it's`` neither
    break the statement nor alter its meaning.
    NOTE(review): assumes Postgres runs with standard_conforming_strings=on
    (its default), where a backslash has no special meaning inside '...'.
    """
    return "'" + str(value).replace("'", "''") + "'"


def build_query(src_vol, src_path, dst_vol, dst_path, exclude_tag=None):
    """Build the queries that list what exists on the destination only.

    Both queries compute paths relative to the given source and destination
    directories and keep the destination entries that have no counterpart on
    the source (right outer join with ``p1 IS NULL``).

    :param src_vol: source volume name
    :param src_path: source directory inside the volume ("" for the volume root)
    :param dst_vol: destination volume name
    :param dst_path: destination directory inside the volume ("" for the root)
    :param exclude_tag: tag name; destination directories carrying it (or below
        a directory carrying it) are left out of the directory query.  When
        None, the value of the ``--exclude-tag`` command line option is used
        if the script parsed one; pass "" to force no exclusion.
    :returns: tuple ``(directory query, file query)`` - directories first
    """
    if exclude_tag is None:
        # fall back to the CLI option; tolerate the module being imported
        # without any parsed arguments
        exclude_tag = getattr(globals().get("args"), "exclude_tag", None)

    src_vol_sql = _sql_literal(src_vol)
    dst_vol_sql = _sql_literal(dst_vol)
    src_path_sql = _sql_literal(src_path)
    dst_path_sql = _sql_literal(dst_path)

    main_select = """SELECT dt2.p2
                  FROM dt1 RIGHT OUTER JOIN dt2 ON p1 = p2
                  WHERE p1 IS NULL
                  """

    # If there's an exclusion tag for destination
    if exclude_tag:
        exclusion_condition = f"""
            AND NOT EXISTS (
                SELECT 1 FROM sf.tag_value_current tvc
                INNER JOIN sf.tag_name tn ON tvc.name_id = tn.id
                WHERE tn.name = {_sql_literal(exclude_tag)}
                AND tvc.volume_id = sf.volume_id_from_name({dst_vol_sql})
                AND (
                tvc.fs_entry_id = d.id
                OR d.ancestor_ids && ARRAY[tvc.fs_entry_id]
            ))
        """
    else:
        exclusion_condition = ""

    # if somebody gives a root volume, we can't use right because it will just strip things
    # this does a substitution for the special case when the source path or dest path are empty
    if src_path == "":
        src_rel = "d.path"
    else:
        src_rel = f"right(d.path, -length({src_path_sql}) -1)"

    if dst_path == "":
        dst_rel = "d.path"
    else:
        dst_rel = f"right(d.path, -length({dst_path_sql}) -1)"

    dquery_ = f"""
        WITH dt1 AS (
            SELECT
              {src_rel} AS p1
              FROM sf.dir_current d
              WHERE d.volume_id = sf.volume_id_from_name({src_vol_sql})
                AND d.ancestor_ids && ARRAY[(
                    SELECT id from sf.dir_current
                    WHERE path = {src_path_sql} AND volume_id = sf.volume_id_from_name({src_vol_sql})
                )]
                AND {src_rel} != ''       -- safety check
        ), dt2 AS (
            SELECT
              {dst_rel} AS p2
              FROM sf.dir_current d
              WHERE d.volume_id = sf.volume_id_from_name({dst_vol_sql})
                AND d.ancestor_ids && ARRAY[(
                    SELECT id from sf.dir_current
                    WHERE path = {dst_path_sql} AND volume_id = sf.volume_id_from_name({dst_vol_sql})
                )]
                AND {dst_rel} != ''       -- safety check
                {exclusion_condition}
          )
          {main_select}
        -- directories need to be ordered depth first for clean removal
        ORDER BY array_length(string_to_array(p2, '/'), 1) DESC
        """

    # the file query compares "<relative directory>/<file name>"
    src_file = f"{src_rel} || '/' || f.name"
    dst_file = f"{dst_rel} || '/' || f.name"

    fquery_ = f"""
        WITH dt1 AS (
          SELECT
          {src_file} AS p1
          FROM sf.dir_current d
            INNER JOIN sf.file_current f ON f.parent_id = d.id
              AND f.volume_id = d.volume_id
            WHERE f.volume_id = sf.volume_id_from_name({src_vol_sql})
              AND d.ancestor_ids && ARRAY[(
                  SELECT id from sf.dir_current
                  WHERE path = {src_path_sql} AND volume_id = sf.volume_id_from_name({src_vol_sql})
              )]
              AND {src_file} != ''       -- safety check
        ), dt2 AS (
          SELECT
          {dst_file} AS p2
          FROM sf.dir_current d
            INNER JOIN sf.file_current f ON f.parent_id = d.id
              AND f.volume_id = d.volume_id
            WHERE f.volume_id = sf.volume_id_from_name({dst_vol_sql})
              AND d.ancestor_ids && ARRAY[(
                  SELECT id from sf.dir_current
                  WHERE path = {dst_path_sql} AND volume_id = sf.volume_id_from_name({dst_vol_sql})
              )]
              AND {dst_file} != ''       -- safety check

        )
        {main_select}
        """

    return dquery_, fquery_


def srcpath_exists(src_path):
    """Verify that the source volpath is safe to compare against.

    Runs ``sf query <src_path> --root-only --json`` and requires that the
    path exists, is a directory and contains at least one file.  Every
    failure is explained on stderr before the exception is raised.

    :param src_path: source volume and path in VOLUME:PATH form
    :returns: True if okay
    :raises BadSrcPath: when the query fails, its output cannot be
        interpreted, or the path is missing, not a directory or has no files
    """
    # 16384 == 0o040000, the directory file-type bit (stat.S_IFDIR);
    # presumably `sf query` reports the POSIX type bits in "type" - confirm
    dir_type = 16384
    parse_msg = "internal error parsing query results"

    qresponse = subprocess.run(
        ["sf", "query", src_path, "--root-only", "--json"],
        check=False,
        capture_output=True,
    )

    err_msg = (
        f"The source path specified: '{src_path}' does not exist or is empty. \n"
        "The result of running clean_target on this path would be complete \n"
        "destruction of all files and dirs on the destination. If this is \n"
        "the desired result, please use a remove job instead.\n"
    )

    if qresponse.returncode:
        print(err_msg, file=sys.stderr)
        raise BadSrcPath(err_msg)

    try:
        js = json.loads(qresponse.stdout)
    except ValueError as exc:  # includes json.JSONDecodeError and bad encodings
        print(parse_msg, file=sys.stderr)
        raise BadSrcPath(parse_msg) from exc

    if not js:
        print(err_msg, file=sys.stderr)
        raise BadSrcPath(err_msg)

    try:
        root = js[0]
        entry_type = root["type"]
        # the file count is only consulted for directories, so only require it there
        file_count = root["rec_aggrs"]["files"] if entry_type == dir_type else None
    except (KeyError, IndexError, TypeError) as exc:
        # unexpected JSON layout: report it instead of leaking a KeyError
        print(parse_msg, file=sys.stderr)
        raise BadSrcPath(parse_msg) from exc

    if entry_type != dir_type:
        err_msg = "The source path specified is not a directory. Exiting."
        print(err_msg, file=sys.stderr)
        raise BadSrcPath(err_msg)
    if file_count == 0:
        err_msg = "The source path has no files. Exiting."
        print(err_msg, file=sys.stderr)
        raise BadSrcPath(err_msg)

    return True


def run_remove(jobargs, filepath, dst_vol):
    """Run a directed ``sf job start ... remove`` for the entries in a list file.

    :param jobargs: extra ``sf job start`` arguments (the callers pass the
        entry type filter here)
    :param filepath: file with one destination path per line, handed to
        ``--from-file``
    :param dst_vol: destination volume name; the job targets ``<dst_vol>:``
    :returns: nothing
    Exits the process with status 1 when the job fails; the list file is
    left in place so that it can be examined.
    """
    command = [
        "sf",
        "job",
        "start",
        "--job-name",
        "clean_target",
        "--wait",
        "--quiet",
        "--no-post-verification",
        "--no-prescan",
        # larger batches are more efficient for remove jobs
        "--batch-size-entries",
        str(10000),
        "--batch-size-bytes",
        str(250 * 1000 * 1000 * 1000),
        *jobargs,
        "--from-file",
        filepath,
        "remove",
        f"{dst_vol}:",
    ]
    dresponse = subprocess.run(command, check=False)
    if dresponse.returncode:
        # diagnostics belong on stderr, like every other error in this script
        print(f"Error running removal job (exit status {dresponse.returncode})", file=sys.stderr)
        print(f"saved remove file at {filepath}", file=sys.stderr)
        sys.exit(1)


###############################################################################
# Main
###############################################################################

if __name__ == "__main__":
    # Command line entry point:
    #   1. parse and validate arguments
    #   2. optionally rescan source and/or target
    #   3. query the database for files and directories that exist only on the target
    #   4. print them (--dry-run) or remove them with two `sf` remove jobs
    examples_text = (
        "Examples: \n"
        "Compare source and target and report on files that exist on target but not on source.\n "
        "Assumes that scans were recently completed.\n "
        "  ./clean-target.py --dry-run SFVol1: SFVol2: \n\n "
        "Scan source and target and then report on files that exist on target but not on source.\n "
        "  ./clean-target.py --dry-run --prescan-source --prescan-target SFVol1: SFVol2: \n\n "
        "Run sfcopy, scan target, and delete files on target that no longer exist on source.\n "
        "  sf job start --wait 'copy' SFVol1: SFVol2:\n "
        "  ./clean-target --prescan-target SFVol1: SFVol2:\n "
        "\n"
        # ANSI bold on/off, with the ESC byte spelled out explicitly
        "\033[1mNotes:\033[0m \n"
        "* When using source and destination volpaths, there should be no / after the :\n"
        "  Right: SFVol1:home/cthuhlu\n"
        "  Wrong: SFVol1:/home/cthuhlu\n"
    )
    # Parse Arguments
    parser = argparse.ArgumentParser(
        description="Starfish utility to replicate deletions after a copy job has run.\n",
        epilog=examples_text,
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument("--debug", action="store_true", required=False, help="add some debugging to output")
    parser.add_argument("--dry-run", action="store_true", required=False, help="show what would be removed")
    parser.add_argument(
        "--prescan-source", "--prescan-src", action="store_true", required=False, help="prescan the source"
    )
    parser.add_argument(
        "--prescan-target", "--prescan-dest", action="store_true", required=False, help="prescan the target"
    )
    parser.add_argument(
        "--nowarn", action="store_true", required=False, help="suppress warning message about memory use"
    )
    parser.add_argument("--verbose", action="store_true", required=False, help="show all removal operations")
    parser.add_argument("--quiet", action="store_true", required=False, help="don't print anything")
    parser.add_argument("--test", action="store_true", required=False, help=argparse.SUPPRESS)
    parser.add_argument(
        "--exclude-tag", required=False, help="do not remove any directories on destination with given tag"
    )
    parser.add_argument("VOLPATH1", help="source volume and path to compare")
    parser.add_argument("VOLPATH2", help="target volume and path to compare")

    # `args` must stay a module-level name: Pg.runquery and build_query read it
    args = parser.parse_args()

    # reject contradictory flags before touching the database or running sf
    if args.verbose and args.quiet:
        print("only one of verbose and quiet may be specified", file=sys.stderr)
        sys.exit(1)

    pgh = Pg()

    if args.test:
        # exit=True makes unittest terminate the process with the test status
        unittest.main(argv=["first-arg-is-ignored"], exit=True)

    if not args.nowarn:
        print(
            "WARNING: This may use up a large amount of memory in Postgres if run "
            " against very large directory trees. Disable this warning with --nowarn",
            file=sys.stderr,
        )

    # sanity check input volpath to make sure it exists
    try:
        srcpath_exists(args.VOLPATH1)
    except BadSrcPath:
        # srcpath_exists already explained the problem on stderr
        sys.exit(1)
    except Exception as exc:
        # anything else (e.g. the sf binary is missing) is just as fatal,
        # but say why instead of exiting silently
        print(f"unable to verify source path '{args.VOLPATH1}': {exc}", file=sys.stderr)
        sys.exit(1)

    srcvol, srcpath = validate_volpath(args.VOLPATH1)
    dstvol, dstpath = validate_volpath(args.VOLPATH2)

    # do prescans if needed
    if args.prescan_source:
        if not args.quiet:
            print("scanning source volpath: " + args.VOLPATH1)
        prescan(args.VOLPATH1)
    if args.prescan_target:
        if not args.quiet:
            print("scanning dest volpath: " + args.VOLPATH2)
        prescan(args.VOLPATH2)

    # Make sure the destination volume is known (has a mount entry) before
    # building anything on top of it.  Single quotes are doubled so that the
    # volume name cannot break out of the SQL string literal.
    dstvol_sql = dstvol.replace("'", "''")
    mpq = f"""
        SELECT m.path from sf_volumes.mount m
        INNER JOIN sf_volumes.volume v
        ON m.volume_id = v.id
        WHERE v.name = '{dstvol_sql}'
        """

    if not pgh.runquery(mpq):
        print(f"destination volume '{dstvol}' was not found (no mount entry)", file=sys.stderr)
        sys.exit(1)

    dquery, fquery = build_query(srcvol, srcpath, dstvol, dstpath)
    # Rewrote this section to use sf remove instead of os remove calls. 6/30/20 KDC
    if args.dry_run:
        print("Dry run - Building file list")
        for filename in pgh.runquery(fquery):
            print(f"{dstpath}/{filename.lstrip('/')}")
        print("\nDry run - Building directory lists")
        for dirname in pgh.runquery(dquery):
            print(f"{dstpath}/{dirname.lstrip('/')}")
        sys.exit(0)

    # There's a fair bit of redundancy here that appears to be refactorable but the refactoring makes
    # it less readable over all.
    # 1) Using a function to differentiate means passing in at least 6 variables
    #    to do it properly (the query discriminator, pgh handle, query itself, whether it's a dir or file, etc.
    # 2) Using a for loop requires fewer variables, but building up a lot of indirection by saving the
    #    sf query difference (--not), lots of if statements to handle each kind (file vs dir for printing info
    #    the sql query, and the filename which we often want to leave in the filesystem for examination
    #
    # so, just cleaning this up a bit and normalizing and using tempfile seems to be the most readable thing to do
    # also using a small function where the number of variables isn't large

    if not args.quiet:
        print("\nBuilding list of files ...")
    # delete=False: the list must outlive the handle so that `sf` can read it
    with tempfile.NamedTemporaryFile(prefix=f"clean_target_{dstvol}_files_", delete=False, mode="w") as filesfile:
        filespath = filesfile.name
        for filename in pgh.runquery(fquery):
            entry = f"{dstpath}/{filename.lstrip('/')}"
            if args.verbose:
                print(entry)
            filesfile.write(f"{entry}\n")

    if not args.quiet:
        print("Starting file remove job")
    # don't run a job for an empty list
    if os.stat(filespath).st_size > 0:
        run_remove(["--not", "--type", "d"], filespath, dstvol)
    # save if in debug
    if not args.debug:
        os.remove(filespath)
    else:
        print(f"temporary file of files saved to {filespath}")

    # mostly the same, but for dirs, tweaks in query sql filter, sf filter, name, etc.

    if not args.quiet:
        print("\nBuilding list of directories ...")
    with tempfile.NamedTemporaryFile(prefix=f"clean_target_{dstvol}_dirs_", delete=False, mode="w") as dirsfile:
        dirspath = dirsfile.name
        for dirname in pgh.runquery(dquery):
            entry = f"{dstpath}/{dirname.lstrip('/')}"
            if args.verbose:
                print(entry)
            dirsfile.write(f"{entry}\n")

    if not args.quiet:
        print("Starting directory remove job")
    if os.stat(dirspath).st_size > 0:
        run_remove(["--type", "d"], dirspath, dstvol)
    # save if in debug
    if not args.debug:
        os.remove(dirspath)
    else:
        print(f"temporary file of dirs saved to {dirspath}")

    if not args.quiet:
        print("Finished clean_target successfully")
