"""
***********************************************************************************************************

 Starfish Storage Corporation ("Starfish") CONFIDENTIAL
 Unpublished Copyright (c) 2011-2021 Starfish Storage Corporation, All Rights Reserved.

 NOTICE: This file and its contents (1) constitute Starfish's "External Code" under Starfish's most-recent
 Limited Software End-User License Agreement, and (2) is and remains the property of Starfish. The
 intellectual and technical concepts contained herein are proprietary to Starfish and may be covered by
 U.S. and/or foreign patents or patents in process, and are protected by trade secret or copyright law.
 Dissemination of this information or reproduction of this material is strictly forbidden unless prior
 written permission is obtained from Starfish. Access to the source code contained herein is hereby
 forbidden to anyone except (A) current Starfish employees, managers, or contractors who have executed
 confidentiality or nondisclosure agreements explicitly covering such access, and (B) licensees of
 Starfish's software.

 ANY REPRODUCTION, COPYING, MODIFICATION, DISTRIBUTION, PUBLIC PERFORMANCE, OR PUBLIC DISPLAY OF OR
 THROUGH USE OF THIS SOURCE CODE WITHOUT THE EXPRESS WRITTEN CONSENT OF STARFISH IS STRICTLY PROHIBITED
 AND IS IN VIOLATION OF APPLICABLE LAWS AND INTERNATIONAL TREATIES. THE RECEIPT OR POSSESSION OF THIS
 FILE OR ITS CONTENTS AND/OR RELATED INFORMATION DOES NOT CONVEY OR IMPLY ANY RIGHTS TO REPRODUCE,
 DISCLOSE, OR DISTRIBUTE ITS CONTENTS, OR TO MANUFACTURE, USE, OR SELL ANYTHING THAT IT MAY DESCRIBE, IN
 WHOLE OR IN PART.

 FOR U.S. GOVERNMENT CUSTOMERS REGARDING THIS DOCUMENTATION/SOFTWARE
   These notices shall be marked on any reproduction of this data, in whole or in part.
   NOTICE: Notwithstanding any other lease or license that may pertain to, or accompany the delivery of,
   this computer software, the rights of the Government regarding its use, reproduction and disclosure are
   as set forth in Section 52.227-19 of the FARS Computer Software-Restricted Rights clause.
   RESTRICTED RIGHTS NOTICE: Use, duplication, or disclosure by the Government is subject to the
   restrictions as set forth in subparagraph (c)(1)(ii) of the Rights in Technical Data and Computer
   Software clause at DFARS 52.227-7013.

***********************************************************************************************************
"""
# pylint: disable=redefined-builtin

############################################################################################
#
# last modified 2023-09-25 - Doug H
# 1.1  add async query, rearrange the entire thing into alike sections and fix params definition for manifest
# 1.05 add methods to fetch bucket mapping and credentials for auth, previously
#      in aws.py
# 1.04 updated to allow passing a query filter through to archive commands
# 1.03 fixed issue with sending down dedupe and tar options when they are not used
# 1.02 fixed typo of a parameter name for run_archive variable name
# 1.01 fixed API for scanning where a typo was causing it to always scan from the root
#      of a volume
# 1.0: updated to have a version number which can be used by other scripts to check that
#      the minimum version needed is being used for that script. (See zonetag_archive.py
#      for example)

import configparser
import email.utils
import json
import logging
import math
import os
import platform
import re
import smtplib
import socket
from email import encoders
from email.charset import QP, Charset
from email.mime.base import MIMEBase
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from pprint import pformat
from urllib.parse import quote, urlencode, urlparse

import requests
import urllib3
from requests.auth import HTTPBasicAuth

# Module version number. Per the changelog above, other scripts can compare against
# this value to check that the minimum version they need is being used.
VERSION = 1.1

# Module-level logger used by the classes below for diagnostics.
sfc_logger = logging.getLogger(__name__)


class S3ArchiveTarget(dict):
    """Dictionary describing an S3 archive target.

    Every constructor argument is stored under a key of the same name. The
    credential, checksum, retry, SSL and storage-class settings are always
    present; the remaining settings are stored only when a truthy value is given.
    """

    def __init__(
        self,
        bucket_name,
        aws_access_key_id,
        aws_secret_access_key,
        endpoint_url=None,
        dest_path=None,
        invalid_chars=None,
        aws_session_token=None,
        store_md5_in_metadata=True,
        is_etag_equal_to_md5=True,
        verify_content_checksum=False,
        timeout=None,
        retries=7,
        region_name=None,
        signature_version=None,
        use_ssl=True,
        verify_ssl=True,
        single_read_on_s3_upload=False,
        storage_class="STANDARD",
    ):
        super().__init__()

        # Settings that are always part of the target description.
        self.update(
            bucket_name=bucket_name,
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
            store_md5_in_metadata=store_md5_in_metadata,
            is_etag_equal_to_md5=is_etag_equal_to_md5,
            verify_content_checksum=verify_content_checksum,
            retries=retries,
            use_ssl=use_ssl,
            verify_ssl=verify_ssl,
            single_read_on_s3_upload=single_read_on_s3_upload,
            storage_class=storage_class,
        )

        # Settings that are left out entirely unless a truthy value was supplied.
        optional_settings = (
            ("endpoint_url", endpoint_url),
            ("dest_path", dest_path),
            ("invalid_chars", invalid_chars),
            ("aws_session_token", aws_session_token),
            ("timeout", timeout),
            ("region_name", region_name),
            ("signature_version", signature_version),
        )
        for setting, value in optional_settings:
            if value:
                self[setting] = value

    def sf_type(self):
        """Return the type identifier of this target, always ``"s3"``."""
        return "s3"


class SFMail:
    """Build a multipart email and relay it through an SMTP host (localhost by default).

    Headers are collected in ``self.headers`` and copied onto the MIME message
    when ``send_smtp`` is called; body parts and attachments are added to
    ``self.message`` directly.
    """

    def __init__(self, mailhost="localhost", port=25, **kwargs):
        """Create an empty message.

        :param mailhost: SMTP relay host name
        :param port: SMTP relay port
        :param kwargs: optional ``From``, ``To`` and ``Subject`` header values. When
            ``From`` is missing or None it defaults to ``<login user>@<fqdn>``.
        """
        self.message = MIMEMultipart()
        self.mailhost = mailhost
        self.port = port
        self.headers = {"Date": email.utils.formatdate(localtime=True)}

        if "From" in kwargs and kwargs["From"] is not None:
            self.headers["From"] = kwargs["From"]
        else:
            user = os.getlogin()
            host = socket.getfqdn()
            self.headers["From"] = f"{user}@{host}"
        if "To" in kwargs:
            self.headers["To"] = kwargs["To"]
        if "Subject" in kwargs:
            self.headers["Subject"] = kwargs["Subject"]

    def send_smtp(self):
        """Send the message over a plain SMTP connection to ``mailhost:port``.

        The ``To`` header is split on commas to form the recipient list.

        :return: True when the message was handed to the relay, False when a
            required header (To, From, Subject) is missing or sending failed
        """
        # maybe use smtps in future
        # Log in to server using secure context and send email
        # context = ssl.create_default_context()
        if "To" not in self.headers:
            sfc_logger.error("Error: unsent email contains no To:")
            return False
        if "From" not in self.headers:
            sfc_logger.error("Error: unsent email contains no From:")
            return False
        if "Subject" not in self.headers:
            sfc_logger.error("Error: unsent email contains no Subject:")
            return False

        for hdr in self.headers:
            self.message[hdr] = self.headers[hdr]

        try:
            with smtplib.SMTP(self.mailhost, self.port) as server:
                # server.login(sender_email, password)
                server.sendmail(
                    self.headers["From"],
                    self.headers["To"].split(","),
                    self.message.as_string(),
                )
            return True
        except Exception as e:
            # best effort: any failure is logged and reported through the return value
            sfc_logger.error(f"Error: email send failure: {e}")
            return False

    def attach(self, filename, mimetype="application/octet-stream", stream=None):
        """Add a base64-encoded attachment to the email.

        :param filename: name shown for the attachment; also the file that is read
            when no ``stream`` is given
        :param mimetype: ``type/subtype`` string for the attachment part
        :param stream: attachment payload; when falsy the payload is read from
            ``filename`` in binary mode
        :return: Nothing
        """
        first, second = mimetype.split("/")

        if stream:
            part = MIMEBase(first, second)
            part.set_payload(stream)
        else:
            # Open file in binary mode
            with open(filename, "rb") as attachment:
                # Add file as e.g. application/octet-stream
                # Email client can usually download this automatically as attachment
                part = MIMEBase(first, second)
                part.set_payload(attachment.read())

        # Encode file in ASCII characters to send by email
        encoders.encode_base64(part)

        # Add header as key/value pair to attachment part
        part.add_header(
            "Content-Disposition",
            f"attachment; filename= {filename}",
        )
        self.message.attach(part)

    def set_body(self, bodytext, msgtype="plain"):
        """Attach ``bodytext`` as a UTF-8 text part of subtype ``msgtype``."""
        # Use Quoted Printable for header and body encoding (the default is base64)
        charset = Charset("utf-8")
        charset.header_encoding = QP
        charset.body_encoding = QP

        self.message.attach(MIMEText(bodytext, msgtype, _charset=charset))

    def set_header(self, header, value):
        """Set a message header property."""
        self.headers[header] = value

    def set_headers(self, **kwargs):
        """Set multiple headers at once, one per keyword argument."""
        for hkey, hvalue in kwargs.items():
            self.set_header(hkey, hvalue)

    def get_header(self, header):
        """Get a message header property.

        :param header: name of the header to look up
        :return: the header value, or None when the header has not been set
        """
        # Look up the requested name itself (the lookup previously tested the
        # literal key "header" and so returned None for every real header).
        return self.headers.get(header)

    def delete_header(self, header):
        """Delete a header property.

        :param header: the property to delete
        :type header: string
        :raises KeyError: when the header has not been set
        """
        del self.headers[header]


class SFCommon:
    """Common modules to be used by multiple classes with Starfish"""

    def __init__(self):
        """Read the Starfish service config, open an API session and map the volumes.

        Steps performed:

        * read ``etc/01-service.ini`` below the directory named by the ``SFHOME``
          environment variable (falling back to a platform-specific default),
        * create a ``requests`` session that authenticates as user ``starfish``
          with the ``secret_key`` from the ``global`` section (reading it requires
          read access to the config file),
        * fetch the core config from the config service and remember
          ``starfish_url_prefix`` (default ``https://localhost``),
        * fetch the volume list and fill ``vol_to_mount``, ``vol_to_agent`` and
          ``mount_to_vol`` from the first mount entry of every volume.

        :raises configparser.Error: when the ``global`` section or one of its
            required options is missing
        :raises RuntimeError: when ``config_service_url`` has no scheme/host or an
            API call does not return JSON
        """
        self.vol_to_mount = {}
        self.vol_to_agent = {}
        self.mount_to_vol = {}
        self._config_ = {}
        self._zones_ = None
        self._cached_sorted_mounts_ = None
        self._buckets_ = {}

        # the API helpers in this class call requests with verify=False; silence
        # the warning urllib3 would otherwise emit for each such call
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

        # parse configs and prepare rest auth strings
        config = configparser.ConfigParser(strict=False)

        # The variable is called SFHOME on every platform: "%SFHOME%" is only
        # cmd.exe expansion syntax and never the name of an environment variable.
        if platform.system() == "Windows":
            default_home = "C:\\Program Files\\starfish"
        else:
            default_home = "/opt/starfish"
        sfhome = os.getenv("SFHOME", default_home)
        config.read(os.path.join(sfhome, "etc", "01-service.ini"))
        passwd = config.get("global", "secret_key")
        self._config_["apihost"] = self.get_apihost(config)

        # the session must exist before the first API call below
        self._config_["session"] = requests.Session()
        self._config_["session"].auth = HTTPBasicAuth("starfish", passwd)

        self._core_config = self.get_core_config(config)
        self._config_["starfish_url_prefix"] = self._core_config.get("starfish_url_prefix", "https://localhost")

        # get volume list, and collect a set of mappings from volume to mount point
        # and inverse
        for ve in self.fetch_volumes():
            vol = ve["vol"]
            mountlist = ve["mounts"]
            if not mountlist:
                # nothing to map for this volume; skip it instead of failing the
                # whole constructor
                sfc_logger.warning("volume %s has no mounts, skipping", vol)
                continue

            # only the first mount entry is used; its key is the agent address
            agentname = next(iter(mountlist))
            # second piece of e.g. "scheme://host:port" when split on "/" and ":"
            agentip = re.split("[/:]+", agentname)[1]
            sfc_logger.debug("mountlist: %s", mountlist)
            sfc_logger.debug("agentname: %s", agentname)
            mountpoint = mountlist[agentname]
            sfc_logger.debug("details: %s:%s:%s:%s", agentname, agentip, vol, mountpoint)
            self.vol_to_mount[vol] = mountpoint
            self.vol_to_agent[vol] = agentip
            self.mount_to_vol[mountpoint] = vol

        self.vol_to_mount2 = {}
        self.vol_to_agent2 = {}
        self.mount_to_vol2 = {}

    ##################### URL/REST/core  ##############################

    def get_core_config(self, config):
        """get core config service url"""
        config_service_url = config.get("global", "config_service_url")
        return self._perform_get(f"{config_service_url}/api/config")

    def get_apihost(self, config):
        """Extract the API host name from ``config_service_url``.

        :param config: parsed service config providing ``config_service_url`` in
            its ``global`` section
        :return: the host name part of the URL
        :raises RuntimeError: when the URL has no scheme or no host name
        """
        config_service_url = config.get("global", "config_service_url")
        sfc_logger.info(f"config_service_url: {config_service_url}")

        parts = urlparse(config_service_url)
        if parts.hostname and parts.scheme:
            return parts.hostname

        raise RuntimeError(
            "Can't get hostname and scheme from config_service_url. Won't be able to contact api host for queries."
        )

    # split_s3_path
    @staticmethod
    def split_s3_path(path):
        """split an s3: path into a bucket and a key
        :return: bucket, key"""
        m = re.match("(s3:|https?:)//([^/]+)/(.*)$", path)
        if not m:
            sfc_logger.error("path must match a local path or an S3 path specification")
            return False
        bucket = m.group(2)
        key = m.group(3)
        return (bucket, key)

    # what is it?
    @staticmethod
    def classify_path(path):
        """return what kind of path this thing is that you gave me"""
        if re.match("^(s3:|http)", path):
            return "s3"
        if re.match("([^/:]+):", path):
            return "volpath"
        else:
            return "fqpath"

    # function to return properties
    def get(self, var):
        """Return the configuration value stored under ``var``.

        :param var: key in the internal ``_config_`` dictionary
        :return: the stored value, or None (after a debug log entry) when the key
            is not present
        """
        try:
            return self._config_[var]
        except KeyError:
            # pass the name as a logging argument so it actually appears in the
            # message (the old literal logged "{var}" verbatim)
            sfc_logger.debug("get: var %s not found", var)
            return None

    def generate_url(self, api_path):
        """Build a full API URL from ``starfish_url_prefix`` and ``api_path``.

        A single leading ``/`` on ``api_path`` is dropped so that the result always
        has exactly one slash after ``/api``. An empty ``api_path`` is accepted and
        yields the bare ``<prefix>/api/`` URL.

        :param api_path: path below ``/api/``, with or without a leading slash
        :return: url string
        """
        # startswith() instead of api_path[0] so that "" does not raise IndexError
        if api_path.startswith("/"):
            api_path = api_path[1:]
        return f"{self.get('starfish_url_prefix')}/api/{api_path}"

    def _perform_get(self, url, txt_format=False):
        """Perform a GET on the API url specified.

        A non-OK HTTP status is logged, but the response body is still returned
        (as text or decoded JSON) so callers see whatever the server sent.

        :param url: full URL to request
        :param txt_format: when true return the raw response text instead of
            decoding the body as JSON
        :return: response text when ``txt_format`` is true, otherwise the decoded
            JSON payload
        :raises RuntimeError: when JSON was expected but the body is not valid JSON
        """
        sfc_logger.debug("Performing GET: %s", url)

        sess = self.get("session")
        # certificate verification is deliberately off for this request
        r = sess.get(url, verify=False)
        # Only the HTTP status is checked: a membership test such as `"error" in r`
        # iterates the body's byte chunks and can never match a str.
        if not r.ok:
            sfc_logger.error(f"error in get api {str(r.reason)}, {r.text} ")
            sfc_logger.error(f"headers: {r.headers}")
        if txt_format:
            return r.text

        try:
            return r.json()
        except ValueError as err:
            # Every JSON decode error requests can raise is a ValueError subclass,
            # including the variant used when simplejson is installed.
            sfc_logger.error(f"call did not return JSON repsonse: {url}")
            raise RuntimeError(f"API URL did not return JSON, probably wrong URL: {url}") from err

    def _perform_delete(self, url):
        """Perform a DELETE on the API url specified.

        :param url: full URL to request
        :return: True when the server answered with an OK status, False (after
            logging reason, body and headers) otherwise
        """
        sfc_logger.debug("Performing DELETE: %s", url)

        sess = self.get("session")
        # certificate verification is deliberately off for this request
        r = sess.delete(url, verify=False)
        # Only the HTTP status is checked: a membership test such as `"error" in r`
        # iterates the body's byte chunks and can never match a str.
        if not r.ok:
            sfc_logger.error(f"error in delete api {str(r.reason)}, {r.text} ")
            sfc_logger.error(f"headers: {r.headers}")
            return False
        return True

    def _perform_put_or_post(self, url, data, operation, content_type="application/json"):
        """Send ``data`` to ``url`` with a PUT or POST request.

        :param url: full URL to request
        :param data: payload; JSON-encoded when truthy, otherwise passed through
            unchanged (e.g. None for a request without a body)
        :param operation: ``"PUT"`` or ``"POST"``
        :param content_type: value of the Content-Type header; no header is sent
            when falsy
        :return: the decoded JSON response, or False when ``operation`` is invalid
            or the server answered with a non-OK status
        """
        if operation not in ["PUT", "POST"]:
            sfc_logger.error("Valid operations are 'PUT' or 'POST'")
            return False

        headers = {"Content-Type": content_type} if content_type else None

        if data:
            data = json.dumps(data)

        sfc_logger.debug("Performing %s: %s", operation, url)
        sfc_logger.debug("%s data: %s", operation, data)

        sess = self.get("session")
        send = sess.put if operation == "PUT" else sess.post
        # certificate verification is deliberately off for this request
        r = send(url, data=data, headers=headers, verify=False)

        # Only the HTTP status is checked: a membership test such as `"error" in r`
        # iterates the body's byte chunks and can never match a str.
        if not r.ok:
            message = f"error in {operation} via api: {r.reason}, {r.text}"
            sfc_logger.error(message)
            return False

        return r.json()

    def _perform_post(self, url, post_data, content_type="application/json"):
        """POST ``post_data`` to ``url``; thin wrapper around ``_perform_put_or_post``.

        :return: whatever ``_perform_put_or_post`` returns for a ``"POST"`` operation
        """
        return self._perform_put_or_post(url, post_data, "POST", content_type)

    def _perform_put(self, url, put_data, content_type="application/json"):
        """PUT ``put_data`` to ``url``; thin wrapper around ``_perform_put_or_post``.

        :return: whatever ``_perform_put_or_post`` returns for a ``"PUT"`` operation
        """
        return self._perform_put_or_post(url, put_data, "PUT", content_type)

    def _clean_dict(self, input_dict):
        """Return a copy of ``input_dict`` without falsy values and without ``'self'``.

        NOTE: a genuine false value (False, 0, empty string, ...) is dropped together
        with its key, so do not use this helper when such a value has to be passed
        on to another function, JSON body or query string.

        :param input_dict: the dictionary to clean up
        :type input_dict: dict
        :return: a new dictionary holding only the truthy entries whose key is not
            ``'self'``
        """
        return {key: value for key, value in input_dict.items() if key != "self" and value}

    ##################### Volumes ##############################

    def vol_to_path(self, name):
        """Convert a volpath (``volume:subpath``) into a filesystem path.

        A name without ``:`` is treated as a bare volume name when it is a known
        volume, and is otherwise returned unchanged (it may already be a path).

        :return: ``<mount point>/<subpath>`` for a volpath, the mount point for a
            bare volume name, the unchanged name when it holds no ``:`` and is not
            a volume, or None (after logging an error) when the volume is unknown
        """
        volume, separator, subpath = name.partition(":")

        if not separator:
            # no colon at all: either a plain volume name or an ordinary path
            return self.vol_to_mount.get(name, name)

        if volume not in self.vol_to_mount:
            sfc_logger.error(f"Cannot decode volume {volume}")
            return None

        return self.vol_to_mount[volume] + "/" + subpath

    def get_sorted_mount_paths(self):
        """Return the known mount paths in reverse sorted order, computing them once.

        Reverse order puts a nested mount such as /etc/init.d ahead of its parent
        /etc, so the order no longer depends on how the dict keys were inserted.
        The list is cached on first use.

        :returns: list of sorted mount paths from volume mount list
        """
        mounts = self._cached_sorted_mounts_
        if mounts is None:
            mounts = sorted(self.mount_to_vol, reverse=True)
            self._cached_sorted_mounts_ = mounts
        return mounts

    def path_to_volpath(self, name, case_insensitive=False):
        """Convert an OS path to a Starfish volpath.

        A name that already starts with ``<known volume>:`` is returned unchanged.
        Otherwise the first mount path (in the order of ``get_sorted_mount_paths``)
        that is a whole-component prefix of ``name`` is replaced by ``<volume>:``.

        :param name: OS path, or an existing volpath
        :param case_insensitive: compare mount paths without regard to case
        :return: a volpath for name, or None when no mount path matches
        """
        # 0 is the "no flags" value the re functions expect, i.e. case sensitive
        re_flags = re.IGNORECASE if case_insensitive else 0

        # it might already be a volpath
        m = re.match("([^/:]+):", name)
        if m:
            # double check it is actually a volume in starfish and not
            # a wonky file name
            sfc_logger.debug("might match %s", m.group(1))
            if m.group(1) in self.vol_to_mount:
                return name

        separators = ("/", "\\")
        for path in self.get_sorted_mount_paths():
            prefix = re.match(re.escape(path), name, flags=re_flags)
            if not prefix:
                continue

            remainder = name[prefix.end():]
            # The mount path has to end on a path-component boundary, otherwise a
            # mount like /mnt/vol1 would also claim /mnt/vol10/file.
            if remainder and not path.endswith(separators) and not remainder.startswith(separators):
                continue

            # plain concatenation: the volume name is never interpreted as a
            # regex replacement template
            return self.mount_to_vol[path] + ":" + remainder

        return None

    def fetch_volumes(self):
        """Request the volume list from the ``/volume/`` API endpoint.

        :return: the decoded response of the GET request
        """
        return self._perform_get(self.generate_url("/volume/"))

    def create_volume(
        self,
        vol_name,
        root_path,
        agent_address=None,
        display_name=None,
        default_agent_address=None,
        dir_excludes=None,
        file_excludes=None,
        store_win_acl=None,
        store_win_attr=None,
        store_posix_acl=False,
        store_xattrs=False,
        store_xattrs_regex=None,
        total_capacity=None,
        capacity_set_manually=False,
        free_space=None,
        free_space_set_manually=False,
    ):
        """Create a new volume by sending a PUT request to the ``/volume/`` endpoint.

        ``vol``, ``root``, ``capacity_set_manually`` and ``free_space_set_manually``
        are always sent; every other setting is sent only when it is truthy.

        :param vol_name: New volume name. Updating a volume with some new volume name is possible only when no scan
        and no job is pending on this volume, also redash reports cannot be calculated at that time.
        :param root_path: Path where the volume is mounted on the agent (required when adding new volume)
        :param agent_address: Agent address to be added to the volume (required when adding new volume). It is
        only sent when a value is given; this method itself does not look up a default.
        :param display_name: User familiar name that may contain also characters that are forbidden in vol field and
        should be used only for visualisation.
        :param default_agent_address: The agent that will be used to scan volume when no agent provided in scan request.
        :param dir_excludes: array of directories (glob patterns allowed) to be excluded during scanning
        :param file_excludes: array of filenames (glob patterns are allowed) to be excluded during scanning
        :param store_win_acl: Only applies to Windows volumes - false on Linux volumes. If enabled will store also
        Windows access control lists when scanning this volume. Omit when creating a Linux volume or a failure will
        occur.
        :param store_win_attr: Only applies to Windows volumes - cannot be set on Linux volumes. If enabled will store
        also Windows file attributes (read-only, hidden, etc.) when scanning this volume. Omit when creating a Linux
        volume or a failure will occur.
        :param store_posix_acl: Store also POSIX access control lists when scanning this volume. This may be slow.
        :param store_xattrs: Store extended file attributes. This may be slow.
        :param store_xattrs_regex: Regular expression used for filtering extended attributes
        :param total_capacity: Capacity of the volume. This will be ignored if capacity_set_manually is false.
        :param capacity_set_manually: If set to false then total_capacity will be automatically set by agent.
        :param free_space: free space of the volume. This will be ignored if free_space_set_manually is false.
        :param free_space_set_manually: If set to false then free_space will be automatically set by agent.
        :return: json result, or False when the request failed
        """
        data = {
            "vol": vol_name,
            "root": root_path,
            "capacity_set_manually": capacity_set_manually,
            "free_space_set_manually": free_space_set_manually,
        }

        # Optional settings, in the order they are added to the request body. Each
        # one is gated on its OWN value (store_posix_acl used to be gated on
        # store_win_acl and was therefore never sent for POSIX volumes).
        optional_settings = (
            ("display_name", display_name),
            ("total_capacity", total_capacity),
            ("free_space", free_space),
            ("default_agent_address", default_agent_address),
            ("dir_excludes", dir_excludes),
            ("file_excludes", file_excludes),
            ("store_win_attr", store_win_attr),
            ("store_win_acl", store_win_acl),
            ("store_posix_acl", store_posix_acl),
            ("store_xattrs", store_xattrs),
            ("store_xattrs_regex", store_xattrs_regex),
            ("agent_address", agent_address),
        )
        for key, value in optional_settings:
            if value:
                data[key] = value

        url = self.generate_url("/volume/")
        return self._perform_put(url, data)

    def delete_volume(self, vol_name, remove_reports=False):
        """Delete a volume by name through the ``volume/<vol_name>`` endpoint.

        :param vol_name: name of the volume to delete
        :param remove_reports: when truthy, ``?remove_reports=True`` is appended to
            the request URL
        :return: True or False
        """
        query_suffix = "?remove_reports=True" if remove_reports else ""
        target = self.generate_url(f"volume/{vol_name}") + query_suffix
        return self._perform_delete(target)

    ##################### Jobs ##############################

    def get_jobs(self, limit=200, sort_by="created_at+-name", **kwargs):
        """Get a list of jobs from the ``/job/`` endpoint.

        ``limit``, ``sort_by`` and every keyword argument are sent as query
        parameters. Parameters with a falsy value are left out of the request
        (see ``_clean_dict``), so e.g. ``running=False`` is not transmitted.

        :param num_id: get entry by job id
        :type num_id: int, optional
        :param sort_by: how to sort results, default created_at
        :type sort_by: string, optional
        :param limit: limits the number of return results per page, default 200
        :type limit: int, optional
        :param paging_offset: Parameter that describes paging offset. It should be equal to number of entries that have
        been already printed on the previous pages.
        :type paging_offset: int, optional
        :param running: if set to true, lists only running scans; if set to false, lists only not-running (completed)
        scans; cannot be used together with state
        :type running: boolean, optional
        :param volume: name of the volume you wish to restrict the return to
        :type volume: string, optional
        :param root_path: where the job started (default to top of volume)
        :type root_path: string, optional
        :param status: job state(s), either a list of statuses or a single status as a string. Cannot be used together
        with running.
        :type status: list, optional
        :param created_at: Supports FROM-TO and RELATIVE formats;
        :type created_at: date field as string, optional
        :param ended_at: same as created_at
        :type ended_at: date field as string, optional
        :return: body of result
        """
        # Build the parameter dict explicitly. Writing into locals() is not a
        # supported way to pass values on, and from Python 3.13 (PEP 667) such
        # writes are discarded, which silently dropped every keyword filter.
        query_params = self._clean_dict({"limit": limit, "sort_by": sort_by, **kwargs})
        url = self.generate_url("/job/")

        # "+" and "-" stay unescaped so the sort_by expression is sent verbatim
        url = f'{url}?{urlencode(query_params, safe="+-")}'
        return self._perform_get(url)

    def show_job(self, jobid):
        """Fetch one job from the ``/job/<jobid>`` endpoint (like ``sf show job``).

        :param jobid: identifier of the job to show
        :return: json
        """
        job_url = self.generate_url(f"/job/{jobid}")
        job_details = self._perform_get(job_url)
        return job_details

    def fetch_starfish_job_md(self, volpath, **kwargs):
        """Fetch the metadata a job stored for a path.

        Queries the path for entries where ``jobs.<jobname>`` exists (limit 1) and
        returns that job's data from the first result.

        :param volpath: volpath or OS path to look up
        :param jobname: (keyword) name of the job, default ``s3-metadata``
        :return: job information in json format, or False when the path cannot be
            mapped to a volume or the query returned nothing
        """
        # dict.get() accepts its default positionally only; passing it as
        # default=... raised TypeError on every call
        jobname = kwargs.get("jobname", "s3-metadata")

        # verify it's in the right format
        vp = self.path_to_volpath(volpath)
        if vp is None:
            sfc_logger.error("Cannot map %s to a Starfish volume", volpath)
            return False
        vol = quote(vp, safe="")

        prefix = self.generate_url(f"/query/{vol}/?")
        exesuffix = "query=" + quote(f"jobs.{jobname}:exists", safe="&")
        exesuffix += quote(f"&format=jobs.{jobname}&limit=1", safe="=&")
        exestr = prefix + exesuffix

        js = self._perform_get(exestr)
        if js:
            return js[0]["jobs"][jobname]
        return False

    ##################### Query  ##############################

    def fetch_starfish_versions(self, file):
        """Query Starfish for all entries of a file (``search-all``) with job and user info.

        :param file: OS path or volpath of the file
        :return: json jobs result, or None when the file cannot be mapped to a
            volpath
        """
        volpath = self.path_to_volpath(file)
        if volpath is None:
            return None

        query_suffix = "/?query=search-all&format=jobs%20username"
        request_url = self.generate_url(f'/query/{quote(volpath, safe="-_")}') + query_suffix
        versions = self._perform_get(request_url)

        # pformat can be costly, so only build the dump when it will be logged
        if sfc_logger.isEnabledFor(logging.DEBUG):
            sfc_logger.debug("response: %s", pformat(versions, indent=2))
        return versions

    def get_async_query_status(self, query_id):
        """Retrieve the status of an async query from ``/async/query/<query_id>``.

        :param query_id: identifier of the async query
        :return: json structure
        """
        status_url = self.generate_url(f"/async/query/{query_id}")
        status = self._perform_get(status_url)
        return status

    def get_async_query_result(self, query_id):
        """Retrieve the result of an async query as raw text.

        The URL-quoted ``query_id`` is appended to ``/v1/async/query_result/``.

        :param query_id: identifier of the async query
        :return: body of result
        """
        quoted_id = requests.utils.quote(query_id)
        result_url = self.generate_url(f"/v1/async/query_result/{quoted_id}")
        # True selects the text (non-JSON) return format of _perform_get
        return self._perform_get(result_url, True)

    def delete_async_query_result(self, query_id):
        """Delete the stored result of an async query.

        The URL-quoted ``query_id`` is appended to ``/v1/async/query_result/``.

        :param query_id: identifier of the async query
        :return: the outcome reported by ``_perform_delete``
        """
        quoted_id = requests.utils.quote(query_id)
        result_url = self.generate_url(f"/v1/async/query_result/{quoted_id}")
        return self._perform_delete(result_url)

    def async_query(self, volpath, querystr, **kwargs):
        """Start the async process that creates a background query.

        :param volpath: a starfish starting place for the query; the URL-quoted
            form of ``*`` selects all volumes
        :param querystr: query string; it is placed into the URL unchanged
            (presumably callers pass it already URL-safe - verify against callers)
        :param kwargs: optional ``format``, ``limit``, ``group_by``,
            ``print_headers`` (sent as true only when truthy) and ``output_format``
            (default ``json``)
        :return: json structure, or False when the request failed
        """
        append = ""
        if "format" in kwargs:
            append += f'&format={quote(kwargs["format"], safe="=+&")}'
        if "limit" in kwargs:
            # str() so that a numeric limit works; quote() only accepts str/bytes
            append += f'&limit={quote(str(kwargs["limit"]), safe="=")}'
        if "group_by" in kwargs:
            append += f'&group_by={quote(kwargs["group_by"], safe="=")}'
        # honour the VALUE of print_headers: print_headers=False used to be sent
        # as "true" merely because the key was present
        append += "&print_headers=true" if kwargs.get("print_headers") else "&print_headers=false"
        append += f'&output_format={kwargs.get("output_format", "json")}'

        # all volumes special case
        if volpath == quote("*"):
            prefix = "/async/query/?"
        else:
            prefix = f"/async/query/?volumes_and_paths={quote(volpath)}"

        query = prefix + f"&queries={querystr}" + append
        url = self.generate_url(query)
        sfc_logger.debug("querystrg: %s", url)
        return self._perform_post(url, None)

    def query(self, volpath, querystr, **kwargs):
        """Run a synchronous query and return the data.

        :param volpath: starting place of the query; it is placed into the URL
            unchanged, and the URL-quoted form of ``*`` selects all volumes
        :param querystr: query string; it is fully URL-quoted here
        :param kwargs: optional ``exts`` (space separated extensions, each added to
            the query as `` ext=<e>``), ``format``, ``limit`` and ``group_by``
        :return: json query results
        """
        # build up query bit by bit; the ext filters come first because they
        # extend the value of the "query" parameter itself
        append = ""
        exts = kwargs.get("exts")
        if exts:
            append += "".join(quote(f" ext={e}", safe="=") for e in exts.split(" "))

        if "format" in kwargs:
            append += f'&format={quote(kwargs["format"], safe="=+&")}'
        if "limit" in kwargs:
            # str() so that a numeric limit works; quote() only accepts str/bytes
            append += f'&limit={quote(str(kwargs["limit"]), safe="=")}'
        if "group_by" in kwargs:
            append += f'&group_by={quote(kwargs["group_by"], safe="=")}'

        # verify it's in the right format
        qstr = quote(querystr, safe="")

        # all volumes special case
        if volpath == quote("*"):
            prefix = self.generate_url("/query/?")
        else:
            prefix = self.generate_url(f"query/{volpath}/?")

        exestr = prefix + "query=" + qstr + append

        return self._perform_get(exestr)

    ##################### Zones ##############################

    def create_zone(
        self,
        name,
        managers,
        managing_groups,
        paths,
        cost_per_gb=None,
        purpose=None,
        location=None,
    ):
        """Create a new zone by POSTing to the ``/zone/`` endpoint.

        ``cost_per_gb``, ``purpose`` and ``location`` are grouped under
        ``user_params`` in the request body; falsy ones are left out, and
        ``user_params`` itself is omitted when none of them is set.

        :param name: name of the zone to create.
        :param managers: an array of SFZoneManager objects
        :param managing_groups: an array of SFZoneManagingGroup objects
        :param paths: an array of paths in format 'volume:path', eg. projects:dir1/dir2
        :param cost_per_gb: decimal value of the cost per gb
        :param purpose: string describing what the zone is for
        :param location: string for the location of the zone
        :return: body of result
        """
        candidates = (
            ("cost_per_gb", cost_per_gb),
            ("purpose", purpose),
            ("location", location),
        )
        user_params = {key: value for key, value in candidates if value}

        post_data = {
            "name": name,
            "managers": managers,
            "managing_groups": managing_groups,
            "paths": paths,
        }
        if user_params:
            post_data["user_params"] = user_params

        return self._perform_post(self.generate_url("/zone/"), post_data)

    def delete_zone(self, id, force=False):
        """Delete a single zone.

        :param id: the id of the zone to delete
        :param force: when true, ``/?force=true`` is appended so the zone is removed even if it is
            used by some namespace (Default: False)
        :return: body of result
        """
        target = self.generate_url(f"/zone/{id}")
        return self._perform_delete(f"{target}/?force=true" if force else target)

    def create_namespace(self, name, zone_id, action, inheritable=True, pinable=False):
        """Create a new namespace.

        The request body is derived from this method's own arguments: ``locals()`` is snapshotted
        on the first statement and handed to ``self._clean_dict`` before being POSTed.

        :param name: the name of the namespace
        :type name: string
        :param zone_id: the zone id of the zone you want to associate the namespace with
        :type zone_id: int
        :param action: MOVE, COPY, DELETE
        :type action: string
        :param inheritable: Should tags in this namespace be displayed alongside items (files, directories) in tagged
        subtree?, defaults to True
        :type inheritable: bool, optional
        :param pinable: Should tags in this namespace be pinned when archiving, defaults to False
        :type pinable: bool, optional
        :return: false or json contents
        """
        # NOTE: keep this as the first statement. locals() captures every local defined so far
        # (self plus the five parameters); any variable assigned earlier would leak into the payload.
        # presumably _clean_dict strips `self` from the snapshot - verify against its definition
        data = self._clean_dict(locals())  # grab the variables now so that we don't end up with extras later
        url = self.generate_url("/namespace/")

        return self._perform_post(url, data)

    def delete_namespace(self, namespace_name, force=False):
        """Delete a namespace by name.

        :param namespace_name: the name of the namespace to delete
        :param force: accepted for call compatibility; this method does not currently use it
        :return: true or false
        """
        return self._perform_delete(self.generate_url(f"/namespace/{namespace_name}"))

    def get_namespace(self, namespace):
        """Fetch the properties of a namespace and the zones it maps to.

        :param namespace: namespace name, placed directly into the request path
        :return: json structure or False
        """
        endpoint = f"/namespace/{namespace}"
        return self._perform_get(self.generate_url(endpoint))

    def get_zone(self, zone):
        """Get the properties of a single zone.

        :param zone: zone name or zone id; resolved through ``zone_get_id``
        :return: json properties of a single zone, or False when the zone cannot be resolved
            (or the request itself yields False)
        """
        zone_id = self.zone_get_id(zone)
        # zone_get_id signals "unknown zone" with False; do not request "/zone/False"
        if zone_id is False:
            return False

        url = self.generate_url(f"/zone/{zone_id}")
        return self._perform_get(url)

    def get_zones_all(self):
        """Fetch the complete zone listing.

        :return: entire zones json
        """
        return self._perform_get(self.generate_url("/zone/"))

    def get_zones_map(self):
        """Create (and cache) a reversible mapping between zone names and zone ids.

        The resulting dict maps every zone id to its name and every zone name to its id.
        It is stored in ``self._zones_`` and reused by later calls.

        :return: dict, or False when no zones could be fetched
        """
        # shortcut: reuse the cached mapping
        if self._zones_ is not None:
            return self._zones_

        zones = self.get_zones_all()
        if not zones:
            # leave the cache untouched so that a later call retries the fetch instead of
            # returning an empty mapping forever
            return False

        mapping = {}
        for zone in zones:
            mapping[zone["id"]] = zone["name"]
            mapping[zone["name"]] = zone["id"]

        self._zones_ = mapping
        return mapping

    def zone_swap_id(self, zname):
        """Given a zone name or a zone id, return its counterpart.

        :param zname: zone name (returns the id) or zone id (returns the name)
        :return: zone id or name, or False when the zone is unknown or the zone map is unavailable
        """
        zones = self.get_zones_map()
        # get_zones_map returns False when the zones cannot be fetched; a membership test
        # against False would raise TypeError
        if not zones:
            return False

        return zones.get(zname, False)

    def zone_get_id(self, zone):
        """Take an input that might be a zone name or a zone id and turn it into a zone id.

        :param zone: zone id (int or numeric string) or zone name
        :return: zoneid, or False when no such zone is known
        """
        try:
            zone_id = int(zone)
        except (TypeError, ValueError):
            # not numeric (or not convertible at all, e.g. None): treat it as a zone name
            return self.zone_swap_id(zone)

        # numeric input: only accept it when the id is actually present in the zone map
        return zone_id if self.zone_swap_id(zone_id) else False

    def zone_set_user_param(self, zone, param, value):
        """Set a zone user param using the api.

        :param zone: zone name or zone id; resolved through ``zone_get_id``
        :param param: name of the user param to set
        :param value: new value; it is serialized with ``json.dumps`` before being sent
        :return: True or False (False also when the zone cannot be resolved)
        """
        zone_id = self.zone_get_id(zone)
        # zone_get_id signals "unknown zone" with False; do not PUT to "/zone/False/..."
        if zone_id is False:
            return False

        url = self.generate_url(f"/zone/{zone_id}/user_params/{param}")
        return bool(self._perform_put(url, json.dumps(value)))

    def zone_get_user_params(self, zone, param="ALL"):
        """Get one or all user params of a zone using the api.

        :param zone: zone name or zone id; resolved through ``zone_get_id``
        :param param: name of the user param to return, or "ALL" (default) for the whole
            ``user_params`` structure
        :return: param value or None (param value may be valid empty string); None is also
            returned when the zone cannot be resolved or the request yields nothing
        """
        zone_id = self.zone_get_id(zone)
        # zone_get_id signals "unknown zone" with False; do not request "/zone/False"
        if zone_id is False:
            return None

        url = self.generate_url(f"/zone/{zone_id}")
        js = self._perform_get(url)

        # pformat is only worth paying for when debug logging is actually enabled
        if sfc_logger.isEnabledFor(logging.DEBUG):
            sfc_logger.debug("response: %s", pformat(js))

        # should have valid js (else error)
        if not js:
            return None

        user_params = js["user_params"]
        # shortcut
        if param == "ALL":
            return user_params

        sfc_logger.debug("checking %s in %s", param, user_params)

        if param in user_params:
            sfc_logger.debug("returning %s", user_params[param])
            return user_params[param]
        return None

    ##################### Tags  ##############################

    def add_tags(self, file_set, tag_set):
        """Add tags to files in one Starfish API bulk_tag operation.

        :param file_set: a set, a list or a single string of paths to tag
        :param tag_set: a set, a list or a single string of tags to apply
        :return: true or false
        """

        def _to_list(items):
            # a set becomes a list, a bare string becomes a one-element list,
            # anything else is passed through untouched
            if isinstance(items, set):
                return list(items)
            if isinstance(items, str):
                return [items]
            return items

        payload = {
            "paths": _to_list(file_set),
            "tags": _to_list(tag_set),
            "strict": False,
        }

        endpoint = self.generate_url("/tag/bulk/")
        outcome = self._perform_post(endpoint, payload, content_type="application/vnd.sf.tag.bulk+json")
        sfc_logger.debug("add tags reault: %s", outcome)
        return bool(outcome)

    def get_tags(self, in_namespace="", with_namespace=False, with_private=True):
        """List all the available tags.

        The three arguments are formatted into the query string exactly as given.

        :param in_namespace: value for the ``in_namespace`` query parameter, defaults to ""
        :param with_namespace: value for the ``with_namespace`` query parameter, defaults to False
        :param with_private: value for the ``with_private`` query parameter, defaults to True
        :return: json structure
        """
        endpoint = f"/tag/?with_namespace={with_namespace}&in_namespace={in_namespace}&with_private={with_private}"
        return self._perform_get(self.generate_url(endpoint))

    def purge_tag(self, tag, volume=None):
        """Purge the given tag from a whole volume.

        If volume name is not given, or is null, tag is removed from entries on all volumes. Also, if volume name is
        not given, or is null, tag will no longer be available unless reintroduced. If tag name is not given, or is
        null, all tags in a given volume are removed.

        :param tag: tag name in format namespace:tag
        :type tag: string
        :param volume: volume name; only sent when it is truthy
        :return: json structure or false
        """
        payload = {"tag": tag, "volume": volume} if volume else {"tag": tag}
        endpoint = self.generate_url("/tag/purge")
        return self._perform_post(endpoint, payload, content_type="application/vnd.sf.tag.purge+json")

    ##################### Archive  ##############################

    def get_archive_job(self, jobid):
        """Look up an archive job by its id.

        :param jobid: archive job id, placed directly into the request path
        :return: json job information
        """
        return self._perform_get(self.generate_url(f"/archive/job/{jobid}"))

    # get upload job information
    def get_upload_job(self, jobid):
        """Look up an upload (low level) job by its id.

        :param jobid: low level job id, placed directly into the request path
        :return: json job result
        """
        return self._perform_get(self.generate_url(f"/job/{jobid}"))

    # get a mapping of archive targets and credentials for bucket access
    def fetch_archive_targets(self):
        """Create a cache of all s3 archive target to bucket mappings along
           with their parameters so that they can be used for bucket access,
           e.g. via triggerrestore/sqsrestore.

        For every archive target of type "s3" two entries are stored in ``self._buckets_``:
        one keyed by the bucket name (with an extra "target" key) and one keyed by the target
        name (with an extra "bucket" key). Both hold the target's params, where "verify_ssl"
        is converted to a boolean stored under "verify".

        The targets are requested with ``obfuscate=false``, so the cached params may contain
        credentials - avoid logging them.

        :returns: nothing
        """
        url = self.generate_url("/archive/target/?obfuscate=false")
        tgt_list = self._perform_get(url)
        # a failed request yields a falsy result; there is nothing to cache then
        if not tgt_list:
            return

        for atarg in tgt_list:
            if atarg["type"] != "s3":
                continue

            target = atarg["name"]
            params = atarg["params"]
            bname = params["bucket_name"]

            settings = {}
            for key, value in params.items():
                if key == "verify_ssl":
                    # usually a string ("true"/"False"/...); str() also accepts a real bool
                    settings["verify"] = str(value).capitalize() == "True"
                else:
                    settings[key] = value

            self._buckets_[bname] = dict(settings)
            self._buckets_[target] = dict(settings)
            # cross references, set after the params so they always win over same-named params
            self._buckets_[bname]["target"] = target
            self._buckets_[target]["bucket"] = bname

    def get_buckets(self):
        """Return the bucket/target lookup table held on this client.

        :return: the ``_buckets_`` attribute itself (not a copy)
        """
        return self._buckets_

    def get_archive_target_list(self):
        """Retrieve the list of archive targets available.

        :return: body of result
        """
        return self._perform_get(self.generate_url("/archive/target"))

    def create_archive_target(self, name, target, verify=False):
        """Create a new archive target. This is the internal abstraction, use create_archive_target_XXX
        for the individual types you can do.

        :param name: name of the new archive target
        :param target: target description; its ``sf_type()`` supplies the "type" field and the
            object itself is sent as "params"
        :param verify: value sent as the "verify" field, defaults to False
        :return: json structure
        """
        endpoint = self.generate_url("/archive/target")
        payload = dict(name=name, type=target.sf_type(), verify=verify, params=target)
        return self._perform_post(endpoint, payload)

    def delete_archive_target(self, target_id):
        """Delete an archive target.

        :param target_id: the archive target's id
        :type target_id: string
        :return: success or failure of delete
        :rtype: boolean
        """
        return self._perform_delete(self.generate_url(f"/archive/target/{target_id}"))

    def run_archive(
        self,
        source,
        archive_target,
        dest_path,
        compression_type="",
        compression_level=None,
        from_scratch=False,
        prescan_enabled=True,
        storage_class="",
        migrate=False,
        prescan_type="mtime",
        remove_empty_dirs=False,
        generate_manifest=None,
        generate_manifests=None,
        dedup=False,
        tar=False,
        verbose=False,
        query=None,
    ):
        """Runs an archive job

        :param source: volume and path in as volume:path
        :type source: string
        :param archive_target: Archive target name defined with /api/archive/target API
        :type archive_target: string
        :param dest_path: Destination dir path appended to archive_target dest_path
        :type dest_path: string
        :param compression_type: During upload to object store compress file contents.
        Valid choices are xz, gzip
        :type compression_type: string, optional
        :param compression_level: Set compression level. Defaults to 1 for xz and 6 for gzip. Ignored if used without
        compression_type. A level of 0 is sent as given; only None means "use the default".
        :type compression_level: int, optional
        :param from_scratch: Force job to run archive on all matching entries, even if they are already archived,
        defaults to False
        :type from_scratch: bool, optional
        :param prescan_enabled: Enable or Disable filesystem prescanning, defaults to True
        :type prescan_enabled: bool, optional
        :param storage_class: Set the storage class of the target, defaults to ''
        :type storage_class: string, optional
        :param migrate: Remove files from source after copy to archive. If alias remove_source is given, the result is
        calculated as migrate OR remove_source. If the result is False, the option is ignored., defaults to False
        :type migrate: bool, optional
        :param prescan_type: Change prescan type, defaults to "mtime", valid options 'diff', 'mtime', 'sync'
        :type prescan_type: str, optional
        :param remove_empty_dirs: For each removed file remove also parent directory and in case of success it will be
        recursive up to the job root (job root will not be removed if it's volume root). Using this option without
        migrate is prohibited., defaults to False
        :type remove_empty_dirs: bool, optional
        :param generate_manifest: Generate a manifest file for a job.
        If not set, the default setting for upload/copy command will be used (true by default).
        :type generate_manifest: bool, optional
        :param generate_manifests: A legacy option that has no function and will be removed in the future;
        please use generate_manifest instead. It is accepted but not sent.
        :type generate_manifests: bool, optional
        :param dedup: deduplicate files with same contents to object store (uses md5 of file content as uploaded object
        name). This option works for cloud storage only and is exclusive with 'tar'., defaults to False
        :type dedup: bool, optional
        :param tar: upload tar.gz archive of input files to object store instead of individual files. This option is
        exclusive with 'dedup'. Defaults to False
        :type tar: bool, optional
        :param verbose: run job command with DEBUG log level, defaults to False
        :type verbose: bool, optional
        :param query: Query filter for an archive job, defaults to None
        :type query: string
        :return: json structure
        """
        # fields that are always sent
        post_data = {
            "volume_and_path": source,
            "archive_target_name": archive_target,
            "dest_path": dest_path,
            "migrate": migrate,
            "remove_empty_dirs": remove_empty_dirs,
            "generate_manifest": generate_manifest,
            "verbose": verbose,
            "prescan_enabled": prescan_enabled,
            "prescan_type": prescan_type,
            "from_scratch": from_scratch,
        }

        # fields that are only sent when the caller supplied a meaningful value
        if compression_type:
            post_data["compression_type"] = compression_type
            # 0 is a real compression level, so test for "not supplied" rather than truthiness
            if compression_level is not None:
                post_data["compression_level"] = compression_level

        if storage_class:
            post_data["storage_class"] = storage_class

        if dedup:
            post_data["dedup"] = dedup

        if tar:
            post_data["tar"] = tar

        if query:
            post_data["query"] = query

        url = self.generate_url("/archive/start")
        return self._perform_post(url, post_data)

    ##################### Scans ##############################

    def get_scans(
        self,
        sort_order=1,
        limit=None,
        paging_offset=None,
        running=None,
        volume=None,
        state=None,
        type=None,
    ):
        """Get a list of scan jobs.

        The query string is built from this method's own arguments: ``locals()`` is snapshotted,
        passed through ``self._clean_dict`` and url-encoded.

        :param sort_order: -1 for descending order and 1 for ascending, defaults to 1
        :type sort_order: int, optional
        :param limit: limits the number of return results per page
        :type limit: int, optional
        :param paging_offset: Parameter that describes paging offset. It should be equal to number of entries that have
        been already printed on the previous pages.
        :type paging_offset: int, optional
        :param running: if set to true, lists only running scans; if set to false, lists only not-running (completed)
        scans; cannot be used together with state
        :type running: boolean, optional
        :param volume: name of the volume you wish to restrict the return to
        :type volume: string, optional
        :param state: Scan state(s), either a list of states or a single state as a string. Cannot be used together
        with running.
        :type state: list, optional
        :param type: The type of scan to filter for (diff or mtime), defaults to None
        :type type: string, optional
        :return: body of result
        """
        # None is replaced by an empty list BEFORE the locals() snapshot below, so the snapshot
        # sees a list for "state"
        if state is None:
            state = []
        # NOTE: locals() must be taken before any further local (such as url) is defined,
        # otherwise that local would end up in the query string.
        # presumably _clean_dict drops `self` and unset values - verify against its definition
        query_params = self._clean_dict(locals())  # grab the variables now so that we don't end up with extras later
        url = self.generate_url("/scan/")

        # NOTE(review): urlencode is called without doseq=True, so a list-valued "state" would be
        # encoded as the text of the list rather than as repeated parameters - confirm what the
        # API expects
        url = f'{url}?{urlencode(query_params, safe="")}'
        return self._perform_get(url)

    def create_new_scan(self, volume_name, scan_type, start_point="", overlapping_check_disabled=False):
        """Start a scan of a volume.

        :param volume_name: The volume that you wish to scan
        :type volume_name: str
        :param scan_type: choice of "diff", "mtime", or "sync"
        :type scan_type: str
        :param start_point: where to start scanning in the volume, defaults to "" which is the root
        :type start_point: str, optional
        :param overlapping_check_disabled: if set, scan is started without checking if it overlaps already pending
            scan; in rare cases using this parameter can lead to a broken tree structure in the database which can be
            fixed only by performing a sync scan; if really necessary, it is recommended to use this option for small
            jobs (e.g. refreshing directories in UI: depth = 0), defaults to False
        :type overlapping_check_disabled: bool, optional
        :raises ValueError: when scan_type is not one of the three supported values
        :return: json structure
        """
        supported_types = ("diff", "mtime", "sync")
        if scan_type not in supported_types:
            raise ValueError(f"Invalid scan type of '{scan_type}'. Must be one of 'diff', 'mtime', or 'sync'")

        endpoint = self.generate_url("/scan/")
        crawler_options = {"startpoint": start_point}
        payload = {
            "volume": volume_name,
            "type": scan_type,
            "overlapping_check_disabled": overlapping_check_disabled,
            "requested_by": "client",
            "crawler_options": crawler_options,
        }
        return self._perform_post(endpoint, payload)

    def get_scan(self, id):
        """Fetch the details of one scan.

        :param id: The scan id
        :type id: str
        :return: json structure
        """
        return self._perform_get(self.generate_url(f"/scan/{id}"))

    ##################### Manifest  ##############################

    def create_manifest(self, job_id, archive_job=False, low_level_job=False, csv=False):
        """Start the async process that creates a manifest.

        All options travel in the query string; the POST itself carries no body.

        :param job_id: Job id for which manifest should be created.
        :param archive_job: Value true means that job_id must be an archive job id. May not be used with parameter
        low_level_job=true
        :param low_level_job: Value true means that job_id must be a low-level job id. May not be used with parameter
        archive_job=true
        :param csv: When true then result will be prepared in CSV format. Default False and prepares JSON format.
        :return: json structure
        """
        endpoint = self.generate_url(
            f"/async/manifest/{job_id}?&archive_job={archive_job}&low_level_job={low_level_job}&csv={csv}"
        )
        return self._perform_post(endpoint, None)


class SFUtil:
    """Class for utility functions that aren't related to Starfish API"""

    @staticmethod
    def parse_capacity(capacity_string):
        """Convert a capacity with an optional unit into an integer number of bytes.

        Supports B, K, M, G, T, P, KB, MB, GB, TB, PB, KiB, MiB, GiB, TiB, PiB.
        Single character units and the "xB" units are SI (powers of 1000), the "xiB" units are
        binary (powers of 1024). Units are case sensitive. Whitespace between the number and the
        unit, and after the unit, is ignored. A value without unit is taken as bytes.

        The computation uses integer arithmetic only, so large values are exact.

        :param capacity_string: e.g. "1024", "10 GB", "5T", "3MiB"
        :return: integer number of bytes
        :raises ValueError: when the string does not start with digits or the unit is unknown
        """
        try:
            # already a plain number: nothing to convert
            return int(capacity_string)
        except ValueError:
            pass

        match = re.match(r"^(\d+)\s*(.*)", capacity_string)
        if not match:
            raise ValueError(f"Unable to convert string '{capacity_string}' to bytes")

        size = int(match.group(1))
        unit = match.group(2).strip()

        # unit -> multiplier table, e.g. K/KB -> 1000, KiB -> 1024, M/MB -> 1000**2, ...
        multipliers = {"B": 1}
        for power, prefix in enumerate("KMGTP", start=1):
            multipliers[prefix] = 1000**power
            multipliers[prefix + "B"] = 1000**power
            multipliers[prefix + "iB"] = 1024**power

        try:
            return size * multipliers[unit]
        except KeyError:
            raise ValueError(f"Unknown unit {unit}. Unable to parse {capacity_string}") from None
