#!/opt/starfish/examples/venv/bin/python3
"""
***********************************************************************************************************

 Starfish Storage Corporation ("Starfish") CONFIDENTIAL
 Unpublished Copyright (c) 2011 - present Starfish Storage Corporation, All Rights Reserved.

 NOTICE: This file and its contents (1) constitute Starfish's "External Code" under Starfish's most-recent
 Limited Software End-User License Agreement, and (2) is and remains the property of Starfish. The
 intellectual and technical concepts contained herein are proprietary to Starfish and may be covered by
 U.S. and/or foreign patents or patents in process, and are protected by trade secret or copyright law.
 Dissemination of this information or reproduction of this material is strictly forbidden unless prior
 written permission is obtained from Starfish. Access to the source code contained herein is hereby
 forbidden to anyone except (A) current Starfish employees, managers, or contractors who have executed
 confidentiality or nondisclosure agreements explicitly covering such access, and (B) licensees of
 Starfish's software.

 ANY REPRODUCTION, COPYING, MODIFICATION, DISTRIBUTION, PUBLIC PERFORMANCE, OR PUBLIC DISPLAY OF OR
 THROUGH USE OF THIS SOURCE CODE WITHOUT THE EXPRESS WRITTEN CONSENT OF STARFISH IS STRICTLY PROHIBITED
 AND IS IN VIOLATION OF APPLICABLE LAWS AND INTERNATIONAL TREATIES. THE RECEIPT OR POSSESSION OF THIS
 FILE OR ITS CONTENTS AND/OR RELATED INFORMATION DOES NOT CONVEY OR IMPLY ANY RIGHTS TO REPRODUCE,
 DISCLOSE, OR DISTRIBUTE ITS CONTENTS, OR TO MANUFACTURE, USE, OR SELL ANYTHING THAT IT MAY DESCRIBE, IN
 WHOLE OR IN PART.

 FOR U.S. GOVERNMENT CUSTOMERS REGARDING THIS DOCUMENTATION/SOFTWARE
   These notices shall be marked on any reproduction of this data, in whole or in part.
   NOTICE: Notwithstanding any other lease or license that may pertain to, or accompany the delivery of,
   this computer software, the rights of the Government regarding its use, reproduction and disclosure are
   as set forth in Section 52.227-19 of the FARS Computer Software-Restricted Rights clause.
   RESTRICTED RIGHTS NOTICE: Use, duplication, or disclosure by the Government is subject to the
   restrictions as set forth in subparagraph (c)(1)(ii) of the Rights in Technical Data and Computer
   Software clause at DFARS 52.227-7013.

***********************************************************************************************************
"""

import argparse
import atexit
import configparser
import json
import logging
import logging.handlers
import os
import signal  # signal_checker:ignore
import subprocess
import sys
import tempfile
import time
from datetime import datetime, timedelta
from pprint import pprint
from subprocess import PIPE, STDOUT
from typing import Optional
from urllib.parse import quote

import requests
import urllib3
from requests.adapters import HTTPAdapter

# client sites don't need this, so avoid exceptions
try:
    import unittest
    from unittest import mock  # pylint: disable=W0611  # noqa F401
except ImportError:
    pass

try:
    # Requests in this file are sent with verify=False; silence the
    # InsecureRequestWarning urllib3 would otherwise emit for each of them.
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
except Exception:
    # Best effort only: a failure to silence the warning must not stop the script.
    pass

########################################################
# Current script version; the change history follows in the comments below.
version = "1.3.28"


# 2023-10-03: ddzialak
# Fix handling "excludes" from global scope
# 2023-10-03: Piotr Dybowski
# Move scanlog inside SFHOME dir
# version = "1.3.27"
# 2023-03-24: Piotr Dybowski
# Skip monitored volumes
# 2023-03-08: Tomasz Dudziec
# Added exponential backoff for SFApi failing requests
# In that way there is no flood of failing logs when server is down
# version = "1.3.25"
# 2022-10-18: Krystian Dowolski
#  Fixed exception handling
# dispatch scan jobs by agent in parallel and in a loop
# 2022-09-02: Doug Hughes
#  fix honoring of disabled for volume properly when doing HUP
# version = "1.3.23"
# 2022-08-18: Piotr Dybowski
#  Refactor and fix pylint warning: 'redefined-outer-name'
# version = "1.3.22"
# 2022-03-10: Doug Hughes
#  scan timeout too short for Cornell WCM - make it more dynamic
# version = "1.3.21"
# 2022-01-24: Doug Hughes
# add extra_scan_args capability for scan args
# version = "1.3.20"
# 2021-07-08: Dave Gold
# * DG: Change to venv path instead of downloading python3.6
# version = "1.3.19"
# 2020-12-21: Dave Gold (Doug Hughes, David English previous commits)
# * DH: fix up flake8 modulo formatting
# version = "1.3.18"
# * DG: spruce up help; DH: add internal volume parallelism (incomplete)
# version = "1.3.17"
# * DH: add description of dispatchscan.ini location, also update of dispatchscan.ini text
# version = "1.3.16"
# * DH: add num_workers configurable
# version = "1.3.15"
# * DH: add try/catch around logger because there may be a race condition on trying to log when already
# inside a signal handler
# version = "1.3.14"
# * DH: add exclusion capability for scan
# version = "1.3.13"
# * DH: USR1 now prints active queue of pending scans by agent in order
# version = "1.3.12"
# * DH: make timestamps in logs conform to sf collect-logs standard format
# version = "1.3.11"
# * DH: add ability to override mtime_count on a per volume basis and add a disabled flag
# version = "1.3.10"
# * DH: additional debugging for issue of renaming a volume; add a set of scan of volume on scan complete
# version = "1.3.9"
# * DH: add a startup delay for auto-start in supervisord
# version = "1.3.8"
# * DH: refactor how to get scan_history, and put scan history in a method, and refactor active scans
# version = "1.3.7"
# * DH: do not clear scanhistory on loop. Count number of items returned
# version = "1.3.6"
# * DH: do not overwrite an entry in the volume scan history with 0 if already there
# version = "1.3.5"
# * DH: add exit handler and logging; standardize start and stop; change USR1 to be log.warning
# version = "1.3.4"
# * DH: add more details to catch failed scans
# version = "1.3.3"
# * DH: bug fix for creation time when vol moves between agents
# version = 1.3.2
# * DH: add --log-level argument and change startup notice to WARNING
# version = 1.3.1
# * DH: fix bug that causes too many scans to start (reset active dict, fix count)
# version = 1.3.0
# * DH: improved/reordered debugging
# * make minimum scan log entries to fetch = #vols * 2
# * allow volumes to move between agents
# * totally revamp getting ordered list (vols_by_ctime => vols_by_lastscan
# ** fix scanned_too_soon calcs
# ** fix volume failed
# ** revamp scan list into method instead of main (retry_scans)
# ** turn activescans into set
# * revamp logging to show scan loops
# * revamp cfg to allow set of mtime_count and parallel_per_agent while allowing cli override
# * change log file to dispatchs_scan.log to allow collection by sf collect-logs
# version = 1.2.8
# * DE: SfApi class for api management
# version = 1.2.7
# * DH handle logrotate automatically
# version = 1.2.6
# * DH: handle network and service errors contacting starfish rest services
# version = 1.2.5
# * DH: handle mtime_count = 0
# * work without dispatchscan.ini
# * rearrange logging
# * scan history tracking
# version = 1.2.4
# * DH: add signal handler to restart after zeroing out the failed volumes (HUP)
# * add signal to dump currently failed volumes (USR1)
# * fix fail handler
# version = 1.2.3
# * DH: handle config errors with getting sleep time by skipping
# version = 1.2.2
# * DH: better debugging output. nested printing, easier to read. bolding
# * fixed issue with agents scanning all volumes by making the saved ctimes per agent
# version = 1.2.1
# * DH: fix for scan service returning rc != 200 (retries)
# version = 1.2
# * DH: add /opt/starfish/etc/dispatchscan.ini
# * add ability to allow a maximum number of scans for a volume in a given number of hours
# * add global retries flag
#
# 2019-02-09
# DH: version = 1.1.1
#
# * add exponential backoff on failure of a scan.
# * add better scan failure detection
# * add more debugging up to level4
# * add -d alias


def till_forever():
    """Yield ``True`` endlessly; used as the retry strategy that never gives up."""
    keep_going = True
    while keep_going:
        yield keep_going


# Module-level logger shared by every class in this file.
logger = logging.getLogger(__name__)

# Starfish installation root; the SFHOME environment variable overrides the default location.
SFHOME = os.environ.get("SFHOME", "/opt/starfish")


def config_str_to_list(value):
    """Split a comma-separated config string into a list of whitespace-trimmed items."""
    return list(map(str.strip, value.split(",")))


class SfApi:
    """encapsulate all the API calls in a class"""

    def __init__(self, host, user, secret, debug_level=1, request_retry_strategy=None):
        if host == "0.0.0.0":
            raise ValueError(f"Invalid Starfish host: {host}")
        self.host = host
        self.username = user
        self.password = secret
        self.__session = None
        # if a response takes this many seconds, sleep for a minute
        self.high_usage_response_time = 30
        self.timeout = 5
        self.request_sleep = 10
        self.debug_level = debug_level
        self.request_retry_strategy = request_retry_strategy() if request_retry_strategy else till_forever()

    @property
    def _session(self):
        if self.__session is None:
            retries = urllib3.Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
            self.__session = requests.session()
            self.__session.mount("http://", HTTPAdapter(max_retries=retries))
            self.__session.mount("https://", HTTPAdapter(max_retries=retries))
        return self.__session

    def _request(self, method, endpoint, body=None):
        # 200 = OK
        # 400 - request problem, scan should not start
        # 404 - scan not found
        # 409 - conflict, overlapping scan
        # In case of timeout you will not have response code.
        url = f"https://{self.host}:443/api/{endpoint}"
        while next(self.request_retry_strategy):
            if method == "GET":
                if self.debug_level > 4:
                    print("  in _request GET for " + url + " " + str(body))
                try:
                    resp = self._get(url)
                except requests.RequestException as exc:
                    # Treat a timeout for a GET as a server error
                    logger.error(f"Request to {url} has failed with {exc}")
                    resp = requests.Response()
                    resp.status_code = 500
                    if self.debug_level > 1:
                        print(f"HTTP GET timeout for {url}")
                    logger.warning(f" HTTP GET timeout ({self.timeout} s)for {url}")
                    # try larger timeout, up to a point
                    if self.timeout < 30:
                        self.timeout += 1
                else:
                    if resp.ok:  # pylint: disable=R1723
                        break
                    else:
                        logger.error("  %s failed for %s : %s", method, url, str(body))
                        logger.error(f"Request to {url} has failed with response {resp}")
                        sleeptime_ = self.request_sleep
                        time.sleep(sleeptime_)
            elif method == "POST":
                if self.debug_level > 4:
                    print("  in _request POST " + url + " " + str(body))
                try:
                    resp = self._post(url, body)
                except requests.RequestException as exc:
                    # Treat a timeout for a POST as OK
                    logger.error(f"Request to {url} has failed with {exc}")
                    resp = requests.Response()
                    resp.status_code = 200
                    if self.debug_level > 1:
                        print("HTTP POST timeout for " + url)
                    logger.warning(" HTTP POST timeout for %s", url)
                if resp.ok:  # pylint: disable=R1723
                    break
                else:
                    logger.error("  %s failed for %s : %s", method, url, str(body))
                    logger.error(f"Request to {url} has failed with response {resp}")
                    sleeptime_ = self.request_sleep
                    time.sleep(sleeptime_)
            else:
                raise ValueError('method must be "GET" or "POST"')
        if resp.elapsed > timedelta(seconds=self.high_usage_response_time):
            logger.warning("Response takes many seconds, sleep for a minute")
            time.sleep(60)
        return resp

    def retry_wait(self, body):
        """variable retry wait (by volume or default)"""

    def _get(self, url):
        if self.debug_level > 4:
            print("   in _get")
        return self._session.get(url, auth=(self.username, self.password), timeout=self.timeout, verify=False)

    def _post(self, url, body):
        if self.debug_level > 4:
            print("   in _post")
        return self._session.post(
            url,
            auth=(self.username, self.password),
            json=body,
            timeout=self.timeout,
            verify=False,
        )

    def query(self, qstr, volume=None):
        """:returns: query search result"""
        if volume:
            qres = self._request("GET", f"query/{volume}/?query={quote(qstr)}")
        else:
            qres = self._request("GET", f"query/?query={quote(qstr)}")
        if qres:
            return qres.json()
        else:
            return None

    def get_volumes(self):
        """:returns: a volume query result"""
        return self._request("GET", "volume").json()

    def get_active_scans(self):
        """:returns: a list of active scans query result"""
        return self._request("GET", "scan?running=true").json()

    def get_recent_scan_list(self, limit):
        """:returns: a list of recent scans query result"""
        return self._request("GET", f"scan?state=done&sort_order=-1&limit={limit}").json()


class Agents:  # pylint: disable=R0902
    """handle aspects of agent based automatic scanning"""

    def __init__(self):
        """Load the scan-history depth and API credentials, then set up the tracking state.

        Reads the module-level ``cfg`` and ``args`` objects and the service ini files
        ``01-service.ini`` and ``99-local.ini`` under ``SFHOME/etc``.
        """
        self._username_ = "starfish"
        # number of scans to look back; 100 unless scan_history is configured
        configured_history = cfg.get(volume=None, param="scan_history")
        self.scan_history = 100 if configured_history is None else int(configured_history)
        # count scans per volume for mtime scan tweaks
        self._volscancount_ = {}

        # parse configs and prepare rest auth strings
        service_cfg = configparser.ConfigParser(strict=False)
        for ini_name in ("01-service.ini", "99-local.ini"):
            service_cfg.read(f"{SFHOME}/etc/{ini_name}")
        self._secret_ = service_cfg.get("global", "secret_key")
        bind_host = service_cfg.get("global", "bind_host")
        # a wildcard bind address is not connectable; talk to the local host instead
        self._apihost_ = "localhost" if bind_host == "0.0.0.0" else bind_host

        # agent -> set of volumes; assigned in get_vols
        self.agentlist = {}
        # volume -> default agent
        self.vol_to_agent = {}
        # volume -> all agents
        self.vol_to_agents = {}
        # volume -> creation time of the most recent known scan
        self.lastscan = {}
        # mark failed scans
        self.failed = {}
        # active scans per agent
        self.active_scans_by_agent = {}

        self.api = SfApi(self._apihost_, self._username_, self._secret_, debug_level=args.debug)

    def reset_failed(self):
        """Forget every failed volume so they are retried, then re-read the dispatchscan config.

        :returns: nothing
        """
        self.failed.clear()
        cfg.read_config()

    def set_failed(self, vol):
        """sets failed status on volume so it won't be scanned again while this is running
        :returns: nothing"""
        self.failed[vol] = True

    def get_failed(self):
        """:returns: list of all volumes marked as afailed"""
        return self.failed.keys()

    def is_failed(self, vol):
        """Tell whether vol is marked failed and must be skipped.

        At debug levels above 3 the check and its outcome are printed on one line.
        :returns: True|False
        """
        if args.debug > 3:
            print("  checking failed status of " + vol, end="")
        failed = vol in self.failed
        if args.debug > 3:
            print("; True" if failed else "; False")
        return failed

    def get_vols(self):
        """:returns: set of agents and dict of vols to agent"""
        volumes = self.api.get_volumes()
        not_monitored_volumes = [v for v in volumes if v.get("user_params", {}).get("fs_monitor_type") is None]
        for v in not_monitored_volumes:
            agent_ = str(v["default_agent_address"])
            vol = str(v["vol"])
            # skip failed volumes
            if self.is_failed(vol):
                continue
            if args.debug > 3:
                print(f"adding {vol} to agent {agent_}")
            if agent_ not in self.agentlist:
                self.agentlist[agent_] = {vol}
            else:
                self.agentlist[agent_].update([vol])
            self.vol_to_agent[vol] = agent_
            self.vol_to_agents[vol] = list((v["mounts"]).keys())

        return self.agentlist, self.vol_to_agent

    def get_all_agents(self):
        """:returns: list of all agents"""
        return list(self.agentlist.keys())

    def get_volume_agents(self, vol):
        """:returns: list of agents available for a given volume"""
        return self.vol_to_agents[vol]

    def get_scan_history(self):
        """Fold the recent finished-scan list into the per-volume last-scan times.

        For every returned scan the volume's entry in self.lastscan is raised to the
        scan's creation time when that is newer (or when the volume is not yet known).
        :updates: self.lastscan
        :returns: Nothing"""

        history = self.api.get_recent_scan_list(self.scan_history)
        if args.debug > 2:
            entry_count = len(history["scans"])
            print(f"{str(entry_count)} items returned in get_recent_scan_list")

        # the list often holds the same volume several times; remember which volumes
        # were already seen so repeats can be reported as skipped when debugging
        seen_volumes = set()

        for entry in history["scans"]:
            creation_time = int(entry["creation_time"])
            vol = entry["volume"]
            if args.debug > 4:
                print(f"  scan_history: {vol} = {creation_time}")
            # keep the newest creation time per volume
            if vol not in self.lastscan or self.lastscan[vol] < creation_time:
                self.lastscan[vol] = creation_time
            # this only cuts down on messages when debugging
            if vol in seen_volumes and self.lastscan[vol] != 0:
                if args.debug > 4:
                    print(f"  skipping {vol} from scan_list")
            else:
                seen_volumes.add(vol)

    def is_disabled(self, vol):
        """Check the ``disabled`` config parameter of vol.

        Only the exact strings "True" and "true" count as disabled.
        :returns: True|False"""
        flag = cfg.get(volume=vol, param="disabled")
        if args.debug > 2:
            print(f"checking if {vol} is disabled ({flag})")
        return flag is not None and flag in ["True", "true"]

    def vols_by_lastscan(self, filteragent):
        """
        List the volumes of one agent that may be scanned now, most recently scanned first.

        requires: run get_vols first
        A volume is left out when it is disabled, marked failed, already being scanned on
        this agent, or scanned too recently. Volumes without a scan record get a last-scan
        time of 0.
        :returns: list of eligible volumes ordered from newest to oldest last scan
        """
        agent_vols = self.agentlist[filteragent]
        if args.debug > 2:
            print(f"available vols to scan for agent {filteragent}: {str(agent_vols)}")

        # volumes we have no scan record for count as oldest, without
        # overriding an already known scan time
        for vol in agent_vols:
            self.lastscan.setdefault(vol, 0)

        # every known volume, ordered by scan start time (newest first)
        ordered = sorted(self.lastscan, key=self.lastscan.get, reverse=True)
        if args.debug > 4:
            print(f"sorted super-list: {ordered}")

        running_here = self.active_scans_by_agent.get(filteragent, ())
        vol_allowed = []
        for vol in ordered:
            if self.is_disabled(vol):
                continue
            # the ordered list covers all agents; keep only this agent's volumes
            if vol not in agent_vols:
                continue
            if self.is_failed(vol):
                continue
            # already running
            if vol in running_here:
                continue
            if self.scanned_too_soon(vol):
                continue
            vol_allowed.append(vol)

        if args.debug > 1:
            print(f"  volume list by newest to oldest is {vol_allowed}")
        if args.debug > 3:
            for vol in vol_allowed:
                ts = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(self.lastscan[vol]))
                print(f"vol:{vol:>40}\t{ts}")

        return vol_allowed

    def active_scans(self):
        """Rebuild self.active_scans_by_agent (agent -> list of volumes) from the running scans.

        :returns: nothing; the result is stored on the instance
        """
        running = self.api.get_active_scans()
        # start from scratch on every call
        self.active_scans_by_agent.clear()
        if args.debug > 4:
            print("active scans: " + str(running))
        for scan in running["scans"]:
            vol, agent_ = scan["volume"], scan["agent_address"]
            if args.debug > 3:
                print(f"found active scan on agent {agent_} for volume {vol}")
            self.active_scans_by_agent.setdefault(agent_, []).append(vol)

        if args.debug > 2:
            print(f"active_scans returning {self.active_scans_by_agent}")

    def scanned_too_soon(self, vol):
        """Decide whether vol is still inside its hold-off period.

        The period comes from the ``volume_scan_hours_limit`` config parameter and counts
        from the volume's last known scan. Without a recorded scan or without a configured
        limit the volume is never too soon.
        :returns: True|False"""
        red = "\033[1;31m"
        green = "\033[1;32m"
        no_color = "\033[0m"

        if args.debug > 0:
            print(f"   \033[1m{vol}\033[0m: checking if volume is okay to scan")
        now = datetime.now()
        # no scan on record means it cannot have been scanned too soon
        if vol not in self.lastscan:
            return False

        volhours = cfg.get(vol, "volume_scan_hours_limit")
        if args.debug > 1:
            if volhours is None:
                print(f"    minimum scan limit for volume {vol} is not defined. (no volume_scan_hours_limit)")
            else:
                print(f"    scan hours minimum limit for volume {vol} is {str(volhours)}")
        if volhours is None:
            return False

        last_scanned = self.lastscan[vol]
        if args.debug > 1:
            timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(last_scanned))
            print(f"    volume last scanned at {timestamp}")
        okay_to_scan_at = datetime.fromtimestamp(last_scanned) + timedelta(hours=int(volhours))
        if args.debug > 0:
            print(f"    volume {vol} next okay to scan at \033[1m{str(okay_to_scan_at)}\033[0m now {str(now)}")

        too_soon = now <= okay_to_scan_at
        if args.debug > 1:
            if too_soon:
                print(f"{red}{vol}{no_color}    not okay to scan")
            else:
                print(f"{green}{vol}{no_color}     okay to scan")
        return too_soon

    def get_scan_type(self, vol):
        """Choose "diff" or "mtime" for the next scan of vol from its scan counter.

        A volume seen for the first time gets a counter of 0 and a diff scan. Afterwards a
        diff scan is chosen whenever the counter is a multiple of mtime_count (or
        mtime_count is 0); every other scan is an mtime scan.
        :requires: volume name
        :returns: scan type
        """
        # read on every call so a changed mtime_count is picked up after a HUP
        mtime_cfg = cfg.get(vol, "mtime_count")
        # fall back to the command line value when the config has none
        mtime_count = int(mtime_cfg) if mtime_cfg else args.mtime_count

        if vol not in self._volscancount_:
            self._volscancount_[vol] = 0
            return "diff"
        if mtime_count == 0 or self._volscancount_[vol] % mtime_count == 0:
            return "diff"
        return "mtime"

    def successful_scan(self, vol):
        """mark a scan as successful for scancount
        :returns: Nothing"""
        self._volscancount_[vol] += 1

    def subvolume_list(self, vol):
        """get list of dirs at top level of volume
        :returns: list of volpaths"""
        jslist = self.api.query("type=d depth=1", volume=vol)
        vollist = [item["vol_path"] for item in jslist]
        return vollist

    def start_scan(self, vol):
        """Ask the REST API to start a scan of vol.

        The scan type comes from get_scan_type and configured excludes are sent as
        ``crawler_options``. When the reply carries a scan id the volume's scan counter is
        incremented. The volume's last-scan time is set to now whenever a reply was decoded.

        :returns: HTTP status code of the scan request; 503 when no response was received
        """
        scan_type = self.get_scan_type(vol)

        if args.debug > 0:
            print(f"     start_scan: {vol}")
        scanst = {"type": scan_type, "volume": vol, "requested_by": "dispatchscan"}
        excludes = cfg.get_vol_excludes(vol)
        if excludes:
            scanst["crawler_options"] = excludes
        if args.debug > 1:
            print(f"     starting with: {scanst}")
        scanres = None
        try:
            # could have failed in start
            scanres = requests.post(
                f"https://{self._apihost_}:443/api/scan/",
                auth=(self._username_, self._secret_),
                json=scanst,
                verify=False,
                timeout=60,
            )
            scandata = scanres.json()
        except Exception as exc:
            # A Response object is falsy for 4xx/5xx replies, so compare against None;
            # otherwise an error reply with an undecodable body would be reported as 503.
            got_response = scanres is not None
            logger.error(
                " failed start scan of %s in pid %d: '%s'; scanreq='%s'",
                vol,
                os.getpid(),
                str(exc),
                str(scanst),
            )
            if got_response:
                logger.error("  scanres: %d", scanres.status_code)
            if args.debug:
                print(f"     failed start scan of {vol} in pid {os.getpid()}: '{str(exc)}'; scanreq='{str(scanst)}'")
                if got_response:
                    print(f"      scanres: {str(scanres)}")
            return scanres.status_code if got_response else 503

        # If volume has been renamed, the old name still lingers in sf scan
        # list for the history, so this is to override that condition and
        # set the scan time on the volume so that we don't keep scanning it
        # over and over again
        self.lastscan[vol] = int(time.time())

        # a reply without an id (or one that is not a JSON object) means the scan did not start
        if not (isinstance(scandata, dict) and "id" in scandata):
            logger.error(
                " non-starting scan of %s with result %d: %s",
                vol,
                scanres.status_code,
                scanres.text,
            )
            if args.debug:
                print(f"      non-starting scan of {vol} with result: {scanres.status_code} {scanres.reason}")
            return scanres.status_code

        logger.info(
            " starting %s scan of %s in pid %d with scanid %s: %s",
            scan_type,
            vol,
            os.getpid(),
            scandata["id"],
            scanres.status_code,
        )
        if args.debug:
            print(f"      scanres: {scanres}")

        self.successful_scan(vol)
        return scanres.status_code

    @staticmethod
    def vol_is_parallel(vol):
        """Tell whether vol is a parallel volume, i.e. its ``split_scan`` parameter is "True".

        :returns True | False"""
        is_split = cfg.get(volume=vol, param="split_scan") == "True"
        if is_split and args.debug > 1:
            print(f"  parallel scan {vol}")
        return is_split

    def retry_scans(self, vol):
        """Try to start a scan of vol a few times before marking the volume failed.

        The number of attempts comes from the ``retries`` config parameter (default 3).
        Between attempts the wait doubles: 1, 2, 4, ... seconds. Status codes 200 and
        409 (overlapping scan) both end the retries successfully.

        The volume passed in is used throughout; the method no longer depends on the
        module-level ``scanvol`` / ``agents`` names.
        :returns True | False"""
        scancount = 0
        retries = cfg.get(vol, "retries")
        num_retries = int(retries) if retries else 3

        while True:
            http_rc = self.start_scan(vol)
            # 200 = OK
            # 400 - request problem, scan should not start
            # 404 - scan not found
            # 409 - conflict, overlapping scan
            # In case of timeout you will not have response code.
            if http_rc in (200, 409):
                return True

            scancount += 1
            # give up right away once the attempts are used up, without announcing
            # (and sleeping for) a retry that will never happen
            if scancount >= num_retries:
                logger.error(
                    " failed starting scan  of %s %d times. Marking failed.",
                    vol,
                    num_retries,
                )
                self.set_failed(vol)
                if args.debug:
                    print(f"    failed starting scan of {vol} {num_retries} times. Marking failed and skipping.")
                return False

            # increasing retry intervals
            # This is backoff strategy to give 1,2,4,8 seconds delays between consecutive requests.posts
            sleeptime_ = 2 ** (scancount - 1)
            logger.error(
                " retrying (%d) scan of %s in %d seconds",
                scancount,
                vol,
                sleeptime_,
            )
            if args.debug:
                print(f"     retrying ({scancount}) scan of {vol} in {sleeptime_} seconds")
            time.sleep(sleeptime_)

    def run_scan(self, vol):
        """Run ``sf scan start`` for vol in a shell subprocess and wait for child processes to exit.

        The command's stdout and stderr are appended to ``SFHOME/tmp/scanfd.<vol>``.
        :returns: nothing
        """
        scan_type = self.get_scan_type(vol)
        if args.debug > 1:
            print("    run_scan: " + vol)
        scanlog_path = os.path.join(SFHOME, "tmp", f"scanfd.{vol}")
        with open(scanlog_path, "a") as scanlog:
            logger.info(" starting scan of %s in pid %d", vol, os.getpid())
            # The method must be called: interpolating cfg.get_extra_scan_args itself put
            # "<bound method ...>" into the shell command.
            # NOTE(review): assumes get_extra_scan_args takes the volume, like get_workers — confirm.
            extra_args = cfg.get_extra_scan_args(vol)
            # explicit spaces keep the extra arguments and the volume apart
            command = (
                f"/usr/bin/sf scan start --num-workers {cfg.get_workers(vol)} -t {scan_type} "
                f"{extra_args} {vol}:"
            )
            subprocess.Popen(
                command,
                shell=True,
                stdout=scanlog,
                stderr=scanlog,
                close_fds=True,
            )
        # cleanup: reap children until none are left
        while True:
            try:
                os.wait()
            except ChildProcessError:
                break

    def _get_parallel_pathlist(self, vol):
        """Work out which top-level paths of vol to scan and which excludes remain.

        Excludes that match a top-level path remove that path from the list; excludes
        that match nothing are turned into ``--ignore-dir`` options. This is imperfect
        if things aren't at the same level.
        :returns: the pathlist and remaining excludes (scan string)"""
        paths = set(self.subvolume_list(vol + ":"))
        path_excludes = cfg.get_vol_excludes(vol)
        excludes = {f"{vol}:{ex}" for ex in path_excludes} if path_excludes else set()
        if args.debug > 1:
            print(f"path_excludes={str(path_excludes)}")

        reduced_paths = list(paths - excludes)
        # sorted so the option string is stable from run to run (keeps tests deterministic)
        unmatched = sorted(excludes - paths)
        remaining_excludes = " ".join(f"--ignore-dir '{d}'" for d in unmatched)
        return reduced_paths, remaining_excludes

    def parallel_scan(self, vol):
        """Execute a parallel scan of vol in a subshell using gnu parallel.

        gnu parallel will handle keeping the agents full and assigning the dirs to an
        agent; it is given twice as many jobs as the volume has agents. The call blocks
        until the command has finished, and the temporary agent/path files are always
        removed afterwards.

        The agents are looked up for the volume passed in (not the module-level
        ``scanvol``), and the command is logged through the module logger.
        """
        pathlist, ignore_str = self._get_parallel_pathlist(vol)
        # look the agents up before creating temp files so a failed lookup leaks nothing
        agentlist_ = self.get_volume_agents(vol)
        agentcount = len(agentlist_)
        scan_type = self.get_scan_type(vol)

        tmp_files = []
        try:
            pathfile = save_tmp_file(pathlist)
            tmp_files.append(pathfile)
            agentfile = save_tmp_file(agentlist_)
            tmp_files.append(agentfile)

            if args.debug > 1:
                print(f"     dirs saved to {pathfile}")
                print(f"     agents saved to {agentfile}")
            # run twice the number of agents for 2 active scans per agent
            cmd = (
                f"parallel -j {agentcount * 2} --xapply -a {agentfile} -a {pathfile} "
                f"sf scan start --num-workers {cfg.get_workers(vol)} -t {scan_type} "
                f"--wait {ignore_str} --agent-address {{1}} {{2}} > /dev/null"
            )
            logger.info(cmd)
            # run as a foreground, blocking process; the context manager waits for the
            # child so it is reaped, and text mode yields str lines instead of bytes
            with subprocess.Popen(cmd, stdout=PIPE, stderr=STDOUT, shell=True, universal_newlines=True) as proc:
                for line in proc.stdout:
                    line = line.rstrip("\n")
                    if args.debug:
                        print(line)
                    else:
                        logger.info(line)
        finally:
            # cleanup
            for tmp_name in tmp_files:
                os.remove(tmp_name)


class Config:
    """read a dispatchscan config file.
    This sets up mechanisms to override defaults on a per volume basis:
    the [DEFAULT] section holds the global values, every other section is
    looked up by volume name and overrides them for that volume.
    """

    # number of workers used when a volume has no (valid) num_workers setting
    DEFAULT_WORKERS = 16

    def __init__(self, **kwargs):
        """set up empty settings and load the config file

        :param cfgfile: optional keyword overriding the config file location,
            which is otherwise {SFHOME}/etc/dispatchscan.ini"""
        self.cf_file = f"{SFHOME}/etc/dispatchscan.ini"
        self.defaults = {}
        self.volcfg = {}
        self.debug = 0

        # allow override the config file location for unit tests
        if "cfgfile" in kwargs:
            self.cf_file = kwargs["cfgfile"]

        self.read_config()

    def read_config(self):
        """read the dispatchscan.ini configuration file and update internals

        A missing file is silently ignored. A file that cannot be parsed is
        reported on stderr and in the log; the settings are left untouched."""
        if not os.path.exists(self.cf_file):
            return
        try:
            config = configparser.ConfigParser(strict=False)
            config.read(self.cf_file)
        except Exception as exc:
            print("failure to read config file: " + str(exc), file=sys.stderr)
            print("continuing with defaults", file=sys.stderr)
            logger.error("failure to read config file: %s", exc)
            logger.error("continuing with defaults")
            return

        for k in config["DEFAULT"]:
            self.defaults[k] = config.get("DEFAULT", k)

        # sections() never lists the DEFAULT section, so every name is a volume
        for section in config.sections():
            # start from the globals, then apply the section's own values
            merged = dict(self.defaults)
            for option in config[section]:
                merged[option] = config.get(section, option)
            self.volcfg[section] = merged

    def set_debug(self, dbg):
        """enable debugging in config parsing"""
        self.debug = dbg

    def get(self, volume, param) -> Optional[str]:
        """get a config variable from the class instance
        :returns: variable indicated for volume or global default of same or None"""

        # check volume first (may not have an override)
        if self.debug > 2:
            print(f"    fetching param {param} for volume {volume}")
        if volume in self.volcfg and param in self.volcfg[volume]:
            if self.debug > 2:
                print("     fetch returning results: " + self.volcfg[volume][param])
            return self.volcfg[volume][param]
        if param in self.defaults:
            if self.debug > 2:
                print("     fetch returning default results: " + self.defaults[param])
            return self.defaults[param]
        if self.debug > 2:
            print("     fetch returning no results")
        return None

    def get_int(self, volume, param, default=None) -> Optional[int]:
        """get a config variable converted to int
        :returns: the integer value, or default when the variable is not set
                  or is not an integer (the latter is logged as an error)"""
        str_value: Optional[str] = self.get(volume, param)
        if str_value is None:
            return default
        try:
            return int(str_value)
        except (ValueError, TypeError):
            logger.error(f"Config value {str_value} of param {param} for volume {volume} is not an integer")
            return default

    def get_extra_scan_args(self, vol) -> str:
        """:returns any extra/optional args passed to sf scan start, "" if none"""
        return self.get(param="extra_scan_args", volume=vol) or ""

    def get_workers(self, vol) -> int:
        """:returns number of workers defined for volume or default 16 if none found
        or if the configured value is not an integer"""
        vol_workers = self.get(param="num_workers", volume=vol)
        if not vol_workers:
            return self.DEFAULT_WORKERS
        try:
            return int(vol_workers)
        except ValueError:
            # same policy as get_int: a bad value must not stop the dispatcher
            logger.error(
                "Config value %s of param num_workers for volume %s is not an integer", vol_workers, vol
            )
            return self.DEFAULT_WORKERS

    def get_vol_excludes(self, vol):
        """return an exclude list for scan start json via api
        :returns: {"ignore_dirs": [...]} or None when no excludes are configured"""
        excludes = self.get(param="excludes", volume=vol)
        if excludes:
            return {"ignore_dirs": config_str_to_list(excludes)}
        return None

    def as_dict(self):
        """:returns the per volume settings (defaults merged in), keyed by volume"""
        return self.volcfg

    def dump(self):
        """dump the config to the log file - for debugging"""

        # use this instance's own flag; the module-level cfg may not exist
        if self.debug:
            print("Dumping config to log file")
        logger.info("Defaults:")
        for k, v in self.defaults.items():
            logger.info(f"{k} = {v}")
        logger.info("Overrides:")
        for vol, settings in self.volcfg.items():
            logger.info(f" {vol}")
            for k, v in settings.items():
                logger.info(f"  {k} = {v}")


class TestDispatch(unittest.TestCase):
    """Unit test framework

    setUp writes three small .ini fixtures under /tmp; the tests then check
    how read_args and Config interpret them.
    """

    cmdargs = argparse.Namespace

    @staticmethod
    def _write_ini(path, lines):
        """(re)create the .ini fixture at path, one entry of lines per line"""
        with open(path, "w") as tfile:
            for line in lines:
                print(line, file=tfile)

    def setUp(self):
        """create the .ini fixtures used by the tests"""
        # ini1: defaults only; mtime_count is deliberately left out so that
        # test_default_cliargs sees the built-in default
        self._write_ini(
            "/tmp/ini1",
            [
                "[DEFAULT]",
                "num_workers=12",
                "excludes=excludes-from-defaults",
            ],
        )
        # ini2: alternate defaults plus overrides for one volume
        self._write_ini(
            "/tmp/ini2",
            [
                "[DEFAULT]",
                "retries = 3",
                "volume_scan_hours_limit = 1",
                "scan_history = 1601",
                "mtime_count = 5",
                "parallel_per_agent = 3",
                "log_level = WARN",
                "",
                "[vol1]",
                "num_workers=4",
                "excludes=this/that,other",
                "extra_scan_args=--allow-empty",
            ],
        )
        # ini3: a single volume section and no defaults
        self._write_ini(
            "/tmp/ini3",
            [
                "[home]",
                "num_workers=4",
                "excludes=jake,dpreuss,dhughes,doug stuff,jumbo,shrimp",
            ],
        )

    def test_exponential_backoff_for_failing_requests(self):
        """Test exponential backoff during failing requests for starfish core.
        We used to create an avalanche of requests when the starfish server was down because they
        were being produced one by one without any delay.
        After the change, we are using exponential backoff to not create a flood of requests.
        In our testing when the local starfish server dogfood was down it led to 50k -100k requests in 2 minutes.
        Now the amount of Retries for failing  requests is dramatically smaller."""

        username = "not_working_username"
        secret = "not_working_secret"
        host = "https://not_working_host.invalid"

        # create custom strategy for retry to be able to return from "while True" loop.
        # Note: yields True twice, then False (presumably to end the retries).
        def test_request_retry_strategy():
            yield True
            yield True
            yield False

        test_api = SfApi(host, username, secret, debug_level=1, request_retry_strategy=test_request_retry_strategy)
        initial_time = time.time()
        with self.assertRaises(Exception):
            test_api.get_volumes()
        time_needed_for_return_value = time.time() - initial_time
        # expected requests processing time is equal to 5 + 5 + 1 (every next request try is incremented by 1)
        basic_starting_timeout = 5
        timeout_increment_for_failed_request = 1
        expected_time = basic_starting_timeout + (basic_starting_timeout + timeout_increment_for_failed_request)
        # a unittest assertion is not stripped under "python -O" and reports both values
        self.assertGreater(time_needed_for_return_value, expected_time)

    def test_default_cliargs(self):
        """test .ini file options for simple ini with nothing in it"""
        print("test empty .ini")
        cfg_, args_ = read_args(cfgfile="/tmp/ini1")

        self.assertEqual(args_.parallel, 2)
        self.assertEqual(args_.mtime_count, 6)

        # none of these are set in ini1
        for param in ("retries", "volume_scan_hours_limit", "scan_history"):
            self.assertEqual(cfg_.get(volume=None, param=param), None)

    def test_extra_scan_args(self):
        """test presence and functionality of extra_scan_args"""
        print("test extra_scan_args")
        cfg_, _ = read_args(cfgfile="/tmp/ini2")

        extra_scan_args = cfg_.get(volume=None, param="extra_scan_args")
        self.assertEqual(extra_scan_args, None)
        extra_scan_args = cfg_.get(volume="vol1", param="extra_scan_args")
        self.assertEqual(extra_scan_args, "--allow-empty")

    def test_typical_ini(self):
        """test .ini file options for simple ini with alternate defaults in it"""
        print("test typical .ini")
        cfg_, args_ = read_args(cfgfile="/tmp/ini2")

        self.assertEqual(args_.parallel, 3)
        self.assertEqual(args_.mtime_count, 5)

        # Config.get hands the values back as the strings found in the file
        expected_defaults = (
            ("retries", "3"),
            ("volume_scan_hours_limit", "1"),
            ("scan_history", "1601"),
        )
        for param, expected in expected_defaults:
            self.assertEqual(cfg_.get(volume=None, param=param), expected)

    def test_cli_override(self):
        """test .ini with cli override"""
        print("test typical .ini with cli override")

        testargs = ["dispatchscan.py", "--parallel", "8"]
        with unittest.mock.patch("sys.argv", testargs):
            args_ = read_args(cfgfile="/tmp/ini2")[1]
            # confirm that cli arg overrides ini2 value
            self.assertEqual(args_.parallel, 8)

        testargs = ["dispatchscan.py", "--mtime_count", "33"]
        with unittest.mock.patch("sys.argv", testargs):
            args_ = read_args(cfgfile="/tmp/ini2")[1]
            # confirm that cli arg overrides ini2 value
            self.assertEqual(args_.mtime_count, 33)

    def test_log_level(self):
        """test using log level config and cli option"""

        cfg_, args_ = read_args(cfgfile="/tmp/ini1")
        self.assertEqual(cfg_.get(volume=None, param="log_level"), None)
        self.assertEqual(args_.log_level, "INFO")

        testargs = ["dispatchscan.py", "--log-level", "WARN"]
        with unittest.mock.patch("sys.argv", testargs):
            cfg_, args_ = read_args(cfgfile="/tmp/ini1")
            # confirm that cli arg overrides
            self.assertEqual(cfg_.get(volume=None, param="log_level"), None)
            self.assertEqual(args_.log_level, "WARN")

        cfg_, args_ = read_args(cfgfile="/tmp/ini2")
        self.assertEqual(cfg_.get(volume=None, param="log_level"), "WARN")
        self.assertEqual(args_.log_level, "WARN")

        testargs = ["dispatchscan.py", "--log-level", "ERROR"]
        with unittest.mock.patch("sys.argv", testargs):
            cfg_, args_ = read_args(cfgfile="/tmp/ini2")
            # confirm that cli arg overrides
            self.assertEqual(cfg_.get(volume=None, param="log_level"), "WARN")
            self.assertEqual(args_.log_level, "ERROR")

    def test_workers(self):
        """test workers configuration"""
        cfg_ = read_args(cfgfile="/tmp/ini2")[0]
        self.assertEqual(cfg_.get_workers("foo"), 16)
        self.assertNotEqual(cfg_.get_workers("vol1"), 16)
        self.assertEqual(cfg_.get_workers("vol1"), 4)

        cfg_ = read_args(cfgfile="/tmp/ini1")[0]
        self.assertEqual(cfg_.get_workers("foo"), 12)

    def test_excludes(self):
        """test excludes handling"""
        cfg_ = read_args(cfgfile="/tmp/ini2")[0]
        # no volume foo, thus no exceptions
        self.assertEqual(cfg_.get_vol_excludes("foo"), None)
        self.assertEqual(cfg_.get_vol_excludes("vol1"), {"ignore_dirs": ["this/that", "other"]})

        cfg_ = read_args(cfgfile="/tmp/ini1")[0]
        self.assertEqual(cfg_.get_vol_excludes("vol1"), {"ignore_dirs": ["excludes-from-defaults"]})

    def test_parallel_construct(self):
        """test volume parallel scans by sub-volume
        test works on 192.168.10.157 with home: volume"""
        global cfg  # pylint: disable=W0603
        cfg = read_args(cfgfile="/tmp/ini3")[0]
        agents_ = Agents()
        # disable warning about protected member in test
        pathlist, exceptlist = agents_._get_parallel_pathlist("home")  # pylint: disable=W0212
        self.assertCountEqual(
            pathlist,
            [
                "home:.cache",
                "home:.emptydir",
                "home:bob",
                "home:doug",
                "home:ebressler",
                "home:frog morten",
                "home:jim",
                "home:pgalvin",
                "home:sesh",
                "home:starfish",
                "home:dave",
                "home:.test",
            ],
        )
        self.assertEqual(exceptlist, "--ignore-dir 'home:jumbo' --ignore-dir 'home:shrimp'")


##################################################################################
# Global functions
##################################################################################


def done_signal(signum, frame):  # pylint: disable=W0613
    """signal handler: note the received signal number in the log, then
    end the process with exit status 0"""
    logger.warning("Received signal %d. Exiting ", signum)
    # same effect as sys.exit(0)
    raise SystemExit(0)


def done():
    """exit hook: leave a final line in the log when the program ends
    :returns: Nothing"""
    farewell = "Dispatchscan exiting"
    logger.warning(farewell)


def reset_failed_volumes(signum, frame):  # pylint: disable=W0613
    """signal handler: log the volumes currently marked as failed, then
    reset their failed state

    :param signum: number of the received signal (unused)
    :param frame: interrupted stack frame (unused)
    :returns: Nothing"""
    try:
        # both calls read the module-level agents object, which only exists
        # once the main program has created it; a signal arriving before
        # that must not crash the process
        log_failed_volumes()
        agents.reset_failed()
    except NameError:
        logger.warning("Reset of failed volumes requested before agents were set up; nothing to reset.")
        return
    logger.warning("Resetting all failed volumes to normal state.")


def usr2_handler(signum, frame):  # pylint: disable=W0613
    """SIGUSR2 handler: write the current configuration to the log
    :returns: Nothing"""
    # ask for system calls interrupted by this signal to be restarted
    signal.siginterrupt(signum, False)  # signal_checker:ignore
    logger.info("Dumping Config")
    try:
        cfg.dump()
    except NameError:
        # raised when the module-level cfg does not exist; nothing to dump
        return


def usr1_handler(signum, frame):  # pylint: disable=W0613
    """SIGUSR1 handler: log the failed volumes, a separator line, and
    then the scans still pending for each agent
    :returns: Nothing"""
    # ask for system calls interrupted by this signal to be restarted
    signal.siginterrupt(signum, False)  # signal_checker:ignore

    log_failed_volumes()
    separator = "------"
    logger.info(separator)
    log_pending_scans()


def log_failed_volumes():
    """write the names of all volumes currently marked as failed to the log
    (and to stdout when debugging); logs a note when there are none
    :returns: Nothing"""

    failed = agents.get_failed()
    if not failed:
        logger.info("No failed volumes")
        return

    joined = ", ".join(failed)
    if args.debug:
        print("logging failed volumes: ")
        print(joined)

    logger.warning("The following volumes are marked as failed: %s", joined)


def log_pending_scans():
    """log, one line per agent, the volumes queued for scanning in the
    order returned by agents.vols_by_lastscan
    :returns: Nothing"""

    agent_names = agents.get_vols()[0]
    logger.info("Queued active scan order by agent:")
    for name in agent_names:
        # list of all volumes for agent, ordered by oldest, reduced by active and failed and recent
        queued = ", ".join(agents.vols_by_lastscan(name))
        logger.info("%s: %s", name, queued)


def save_tmp_file(alist: list) -> str:
    """take an input list and write it to a temp file for later use

    Each entry is written on its own line. The file is closed (and therefore
    flushed) before the name is returned, and it is NOT deleted automatically:
    the caller is responsible for removing it.

    :param alist: list of strings to save
    :returns: the temp file name
    :raises RuntimeError: when the file cannot be written"""

    fp = tempfile.NamedTemporaryFile(mode="w", delete=False)
    try:
        # the with block guarantees the handle is closed before anyone reads the file
        with fp:
            fp.write("\n".join(alist))
            fp.write("\n")
    except OSError as exc:
        logger.error("error writing temporary file with list for parallel execution: %s", str(exc))
        # do not leave a half written file behind; best effort only
        try:
            os.remove(fp.name)
        except OSError:
            pass
        raise RuntimeError("saving temporary list") from exc

    return fp.name


def read_args(**kwargs):
    """combine the .ini file settings with the command line arguments

    Global values found in the .ini file are used as the defaults of the
    matching command line options, so a value given on the command line wins.

    :param cfgfile: optional keyword with an alternate .ini file location
    :returns: cfg instance and args instance"""

    cfg_ = Config(cfgfile=kwargs["cfgfile"]) if "cfgfile" in kwargs else Config()

    def ini_or(param, fallback):
        """global .ini value of param, or fallback when it is unset or empty"""
        value = cfg_.get(volume=None, param=param)
        return value if value else fallback

    # .ini values come back as strings; argparse applies each option's type to them
    parallel = ini_or("parallel_per_agent", 2)
    debug = ini_or("debug", 0)
    mtime_count = int(ini_or("mtime_count", 6))
    log_level = ini_or("log_level", "INFO")

    parser = argparse.ArgumentParser(
        description=f"balance diff scans across agents in parallel. \
            Loop over volumes and dispatch them <n default=2> at a time on agent machines. \
            Example dispatchscan.ini is located at https://starfishdownloads.s3.amazonaws.com/tools/dispatchscan.ini; \
            this should be put in {SFHOME}/etc. \
            See this file for additional options."
    )

    # (flags, add_argument settings), registered in this order
    option_table = (
        (
            ("--debug", "-d"),
            {"required": False, "default": debug, "type": int, "help": "enable debugging"},
        ),
        (
            ("--parallel",),
            {
                "required": False,
                "default": parallel,
                "type": int,
                "help": "number of simultaneous diff scans per agent (default 2)",
            },
        ),
        (
            ("--show-agents",),
            {
                "required": False,
                "action": "store_true",
                "default": False,
                "help": "show agent and volume mappings and exit.",
            },
        ),
        (
            ("--log-level",),
            {
                "required": False,
                "default": log_level,
                "help": "set loglevel for output to log (default: INFO)",
            },
        ),
        (
            ("--mtime_count",),
            {
                "required": False,
                "type": int,
                "default": mtime_count,
                "help": "number of mtime scans to run between diff scans (default: 6)",
            },
        ),
        (
            ("--test",),
            {"required": False, "action": "store_true", "help": argparse.SUPPRESS},
        ),
        (
            ("--version",),
            {"required": False, "action": "store_true", "help": "show version and exit"},
        ),
    )
    for flags, settings in option_table:
        parser.add_argument(*flags, **settings)

    return cfg_, parser.parse_args()


##################################################################################
# main prog
##################################################################################


if __name__ == "__main__":
    # install the signal handlers first; siginterrupt(..., False) asks for
    # system calls interrupted by these signals to be restarted
    signal.signal(signal.SIGUSR1, usr1_handler)  # signal_checker:ignore
    signal.siginterrupt(signal.SIGUSR1, False)  # signal_checker:ignore
    signal.signal(signal.SIGUSR2, usr2_handler)  # signal_checker:ignore
    signal.siginterrupt(signal.SIGUSR2, False)  # signal_checker:ignore
    signal.signal(signal.SIGHUP, reset_failed_volumes)  # signal_checker:ignore
    signal.siginterrupt(signal.SIGHUP, False)  # signal_checker:ignore
    signal.signal(signal.SIGINT, done_signal)  # signal_checker:ignore
    signal.signal(signal.SIGTERM, done_signal)  # signal_checker:ignore
    atexit.register(done)
    # cfg, args and agents are module-level names used by the functions above
    cfg, args = read_args()
    agents = Agents()
    if args.version:
        print("dispatchscan.py version " + version)
        sys.exit(0)
    if args.test:
        # reset config
        del cfg
        TestDispatch.cmdargs = args
        unittest.main(argv=["first-arg-is-ignored"], exit=True)
        sys.exit(0)
    startup_delay = cfg.get(volume=None, param="startup_delay")
    if startup_delay:
        if args.debug:
            print(f"Waiting for Starfish Services to start. Sleeping {startup_delay} seconds")
        time.sleep(int(startup_delay))
    agentlist, vols = agents.get_vols()
    if args.show_agents:
        pprint(agentlist, indent=4)
        sys.exit(0)
    logfile = f"{SFHOME}/log/dispatch_scan.log"
    if not os.path.exists(logfile):
        try:
            # just create the (empty) file
            with open(logfile, "w") as lf:
                pass
        except OSError as e:
            print("Error: cannot create log file. exiting.")
            print(str(e))
            sys.exit(1)
    if not os.access(logfile, os.W_OK):
        print("fatal: logfile is not writable; Check permissions for " + logfile)
        sys.exit(1)
    log_handler = logging.handlers.WatchedFileHandler(logfile)
    formatter = logging.Formatter(
        fmt="%(levelname).3s %(asctime)s %(message)s [%(filename)s:%(lineno)d]", datefmt="%Y-%m-%d %H:%M:%S,000"
    )

    log_handler.setFormatter(formatter)
    logger.setLevel(args.log_level)
    logger.addHandler(log_handler)
    logger.propagate = False
    logger.info("Dispatchscan starting")
    logger.info(f"Dispatchscan config:\n{json.dumps(cfg.as_dict(), indent=2)}")
    # verify scan history is sufficient to cover volumes under management
    if agents.scan_history < len(vols) * 2:
        agents.scan_history = len(vols) * 2
        logger.warning(
            "scan history is too small. Adjusting it to 2x of number of vols = %d",
            agents.scan_history,
        )

    sleeptime = cfg.get_int(volume=None, param="sleep_between_executions", default=60)
    sleeptime = max(sleeptime, 5)  # minimal sleeptime is 5 seconds
    logger.info(f"Sleep time is set to {sleeptime}")

    # spawn per-agent subprocesses and keep them going
    while True:
        # don't need the returns, will use the methods
        agents.get_vols()
        agents.active_scans()
        # update last scan times for volumes
        agents.get_scan_history()
        for agent in agentlist:
            # get list of all volumes for agent, ordered by oldest, reduced by active and failed and recent
            ordered_vols_to_scan = agents.vols_by_lastscan(agent)
            available_count = len(ordered_vols_to_scan)
            if args.debug:
                print(f"available scans for agent {agent} ", end="")
                print(ordered_vols_to_scan)
            # There may be no scans running at initial start
            try:
                scanning_count = len(agents.active_scans_by_agent[agent])
            except (KeyError, TypeError):
                scanning_count = 0
                if args.debug > 1:
                    print(f" agent {agent} not in agents.active_scans_by_agent")

            if args.debug > 1:
                print(f"  {agent}: running scans {scanning_count} of {args.parallel}")

            # start scans until the agent is at its limit or has nothing left to scan
            while available_count > 0 and scanning_count < args.parallel:
                if args.debug > 3:
                    print(f"    available_count: {available_count}, scanning_count: {scanning_count}")
                # in complete failure (agent) scenario, ordered_vols_to_scan will be empty
                if ordered_vols_to_scan:
                    scanvol = ordered_vols_to_scan.pop()
                else:
                    # none to scan
                    if args.debug:
                        print(f"  {agent}: no vols scan")
                    break
                if args.debug > 0:
                    print(f"    starting scan on volume \033[1m{scanvol}\033[0m, agent {agent}")
                if Agents.vol_is_parallel(scanvol):
                    # NOTE(review): this branch leaves both counters untouched,
                    # so a parallel scan is not counted against args.parallel
                    if args.debug > 0:
                        print(f"    running parallel scan of {scanvol}")
                    logger.info(" starting parallel scan of %s", scanvol)
                    agents.parallel_scan(scanvol)
                    logger.info(" finishing parallel scan of %s", scanvol)
                else:
                    # the counters only move when retry_scans reports success
                    if agents.retry_scans(scanvol):
                        scanning_count += 1
                        available_count -= 1

            # end of agent loop here
            logger.info(f"Done scan loop for {agent}")
            if args.debug > 0:
                print(f"{agent}: Done scan loop for agent. Started {scanning_count} scans")

        logger.info("Done looping over all agents")
        if args.debug > 0:
            print("Done looping over all agents.")
            print("-------------------------------------------")

        # end for all agents loop
        # debug runs poll every 5 seconds, otherwise the configured sleep time applies
        pause = 5 if args.debug else sleeptime
        if args.debug:
            print(f"sleeping {pause} seconds between agent loops")
        time.sleep(pause)
