#!/opt/starfish/examples/venv/bin/python3
"""
***********************************************************************************************************

 Starfish Storage Corporation ("Starfish") CONFIDENTIAL
 Unpublished Copyright (c) 2011 - present Starfish Storage Corporation, All Rights Reserved.

 NOTICE: This file and its contents (1) constitute Starfish's "External Code" under Starfish's most-recent
 Limited Software End-User License Agreement, and (2) is and remains the property of Starfish. The
 intellectual and technical concepts contained herein are proprietary to Starfish and may be covered by
 U.S. and/or foreign patents or patents in process, and are protected by trade secret or copyright law.
 Dissemination of this information or reproduction of this material is strictly forbidden unless prior
 written permission is obtained from Starfish. Access to the source code contained herein is hereby
 forbidden to anyone except (A) current Starfish employees, managers, or contractors who have executed
 confidentiality or nondisclosure agreements explicitly covering such access, and (B) licensees of
 Starfish's software.

 ANY REPRODUCTION, COPYING, MODIFICATION, DISTRIBUTION, PUBLIC PERFORMANCE, OR PUBLIC DISPLAY OF OR
 THROUGH USE OF THIS SOURCE CODE WITHOUT THE EXPRESS WRITTEN CONSENT OF STARFISH IS STRICTLY PROHIBITED
 AND IS IN VIOLATION OF APPLICABLE LAWS AND INTERNATIONAL TREATIES. THE RECEIPT OR POSSESSION OF THIS
 FILE OR ITS CONTENTS AND/OR RELATED INFORMATION DOES NOT CONVEY OR IMPLY ANY RIGHTS TO REPRODUCE,
 DISCLOSE, OR DISTRIBUTE ITS CONTENTS, OR TO MANUFACTURE, USE, OR SELL ANYTHING THAT IT MAY DESCRIBE, IN
 WHOLE OR IN PART.

 FOR U.S. GOVERNMENT CUSTOMERS REGARDING THIS DOCUMENTATION/SOFTWARE
   These notices shall be marked on any reproduction of this data, in whole or in part.
   NOTICE: Notwithstanding any other lease or license that may pertain to, or accompany the delivery of,
   this computer software, the rights of the Government regarding its use, reproduction and disclosure are
   as set forth in Section 52.227-19 of the FARS Computer Software-Restricted Rights clause.
   RESTRICTED RIGHTS NOTICE: Use, duplication, or disclosure by the Government is subject to the
   restrictions as set forth in subparagraph (c)(1)(ii) of the Rights in Technical Data and Computer
   Software clause at DFARS 52.227-7013.

***********************************************************************************************************
last modified 2021-09-01 - doug
  - fix to venv; add --help
"""
import argparse
import os
import sys

import charset_normalizer

# This script detects the encoding of the file names in a directory, using charset_normalizer.
# It prints the detected encoding and confidence level for each file name.

# To run:
# This script requires the mount path of the volume as its only argument. It won't work
# with multi-volume execution
#
# example usage:
# find all directories that have non-utf8 file name warnings and then pipe
# the dir names into detect_encoding.py; it will print the entries whose
# names charset_normalizer does not detect as plain ASCII.
# The argument to detect_encoding.py is the mount point of the starfish volume
# being checked:
# sf query MyVol: --type d --error item_non_utf8_name --print0 -H --format full_path | ./detect_encoding.py /mnt/myvol
# for single directory let's say it's in MyVol (mounted at /mnt/myvol) with file
# path/to/dir/with/junk; example usage:
# echo -ne "path/to/dir/with/junk" | /opt/starfish/bin/tools/detect_encoding.py /mnt/myvol


def main():
    """Report the detected charset of entry names in directories read from stdin.

    The single positional command-line argument (MOUNTPOINT) is the mount
    point of the volume. Standard input supplies a NUL-separated list of
    directory paths relative to that mount point. Each directory is listed
    with raw (bytes) entry names and every name is passed to
    ``charset_normalizer.detect``. Names detected as ``"ascii"`` produce no
    output; for every other name the (decoded, when possible) name and the
    detection result dict are printed.

    Paths are handled as bytes end to end, so directory paths that are not
    valid UTF-8 can be processed; such bytes are shown as ``\\xNN`` escapes
    in the "checking dir" progress line.

    Returns:
        None. Errors from ``os.listdir`` (e.g. a missing directory) propagate.
    """
    description_text = r"""
This program attempts to detect the charset encoding of files with
names that are non-UTF8-8 compliant. It does this using the Python
charset_normalizer module. Note: it only print out a result (the encoding)
for files that actually have an unusual decoding. Normal (ASCII/UTF-8)
files are skipped and have no output.

Normal usage is to pass it a directory that has non-UTF8 file names
in it (from starfish query) and have it tell you which ones.

Example 1:
  sf query MyVol: --type d --error item_non_utf8_name --print0 -H --format full_path | \
    /opt/starfish/bin/tools/detect_encoding.py /mnt/myvol

Since the query will include the volume relative path, you must supply
the mount point of the volume to detect_encoding.py so it can look at
the file names in the correct, fully-qualified way.

Example2: outside of Starfish, run on a specific directory. The same
restrictions apply. Use the volume mount point at the end and use
echo -ne to pass in a null separate list of paths relative to that mount.

  echo -ne 'path/to/dir/with/junk' | \
    /opt/starfish/bin/tools/detect_encoding.py /mnt/myvol

Also valid:

  echo -ne '/my/full/dir\\0/my/other/dir' | \
    /opt/starfish/bin/tools/detect_encoding.py /
"""

    parser = argparse.ArgumentParser(description=description_text, formatter_class=argparse.RawTextHelpFormatter)

    parser.add_argument("mount_point", metavar="MOUNTPOINT", nargs=1, help="the mountpoint of the volume")

    args = parser.parse_args()

    prefix = args.mount_point[0]
    # os.fsencode reverses the decoding Python applied to argv, so the mount
    # point reaches the OS with the exact bytes the caller supplied.
    prefix_bytes = os.fsencode(prefix)

    # Read stdin as bytes: the incoming directory paths may themselves contain
    # non-UTF-8 components, which a text-mode read could not decode.
    # NOTE(review): an empty entry (empty stdin, or a trailing NUL) is kept and
    # lists the mount point itself, as before — confirm that is intended.
    for directory in sys.stdin.buffer.read().split(b"\0"):
        # Display only; invalid bytes become \xNN escapes instead of raising.
        shown = directory.decode("utf-8", errors="backslashreplace")
        print(f"checking dir {prefix}/{shown}")
        dirpath = prefix_bytes + b"/" + directory
        # A bytes path makes os.listdir return raw bytes names, which is what
        # the detector needs.
        for file in os.listdir(dirpath):
            detected = charset_normalizer.detect(file)
            encoding = detected["encoding"]
            # NOTE(review): the help text says UTF-8 names are skipped too, but
            # only "ascii" is filtered here — confirm which is intended.
            if encoding == "ascii":
                continue
            if encoding is None:
                # no idea, print the raw name for job result (and then the dict)
                print(file)
            else:
                # try using the detected encoding to print the file name
                try:
                    print(file.decode(encoding))
                except (LookupError, UnicodeError):
                    # unknown codec name, undecodable bytes, or a name that
                    # stdout cannot encode: fall back to the raw bytes repr
                    print(file)
            print(detected)


if __name__ == "__main__":
    # Run the tool only when executed as a script, and hand main()'s return
    # value to the interpreter as the process exit status.
    exit_status = main()
    sys.exit(exit_status)
