/* legal disclaimer in /opt/starfish/data/starfish/sql-copyright-and-license.md */

-- ajobs: archive jobs created inside the configured look-back window, expanded
-- to one row per element of the 'UPLOADING_FILES' array in low_level_jobs.
WITH ajobs AS (
    SELECT
        aj.volume_id,
        aj.target_id,
        aj.archive_target_name,
        aj.status,
        aj.creation_time,
        aj.end_time,
        -- Elapsed seconds between creation and end of the archive job
        -- (NULL while end_time is NULL).
        EXTRACT(EPOCH FROM (aj.end_time - aj.creation_time)) AS time,
        -- Set-returning call: every array element becomes its own output row,
        -- cast to BIGINT so it can be joined against a job id downstream.
        CAST(
            jsonb_array_elements_text(CAST(aj.low_level_jobs AS jsonb) -> 'UPLOADING_FILES')
            AS BIGINT
        ) AS llj
    FROM sf_archive.archive_job AS aj
    WHERE aj.creation_time >= NOW() - INTERVAL '{{number_of_days_to_look_back}} day'
),
-- jobs: attaches the dispatcher incarnation to every low-level job id from
-- ajobs and derives the transferred size, entry count and throughput.
-- The INNER JOIN drops low-level jobs that have no incarnation row; the target
-- and volume lookups are LEFT JOINs, so missing lookups only yield NULLs.
jobs AS (
    SELECT
        ajobs.volume_id,
        vol.name,
        ajobs.target_id,
        ajobs.archive_target_name,
        tgt.type AS target_type,
        ajobs.status,
        ajobs.creation_time,
        ajobs.end_time,
        ajobs.time,
        ajobs.llj,
        -- fs_bytes_done divided by 1,000,000 (decimal megabytes), 2 decimals.
        ROUND(CAST(inc.fs_stats ->> 'fs_bytes_done' AS BIGINT) / (1000 * 1000.0), 2) AS bytes,
        CAST(inc.fs_stats ->> 'fs_entries_done' AS BIGINT) AS files_count,
        -- Unrounded megabytes divided by the job duration in seconds.
        -- NULLIF maps a zero duration to NULL so the result is NULL rather
        -- than a division-by-zero error.
        -- NOTE(review): ajobs.time is the duration of the whole archive job,
        -- while fs_bytes_done belongs to a single low-level job -- confirm
        -- this is the intended throughput definition.
        CAST(inc.fs_stats ->> 'fs_bytes_done' AS BIGINT) / (1000 * 1000.0)
            / NULLIF(ajobs.time, 0) AS "MB/s"
    FROM ajobs
    INNER JOIN sf_dispatcher.incarnation AS inc
        ON inc.job_id = ajobs.llj
    LEFT JOIN sf_archive.archive_target AS tgt
        ON tgt.id = ajobs.target_id
    LEFT JOIN sf_volumes.volume AS vol
        ON vol.id = ajobs.volume_id
),
-- results: one row per distinct (bytes, files_count) pair with the number of
-- jobs in the group and the average / maximum / median "MB/s", each rounded
-- to two decimals.
results AS (
    SELECT
        j.bytes,
        j.files_count,
        COUNT(*) AS group_rows_count,
        ROUND(CAST(AVG(j."MB/s") AS numeric), 2) AS avg,
        ROUND(CAST(MAX(j."MB/s") AS numeric), 2) AS max,
        ROUND(
            CAST(PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY j."MB/s") AS numeric),
            2
        ) AS median
    FROM jobs AS j
    GROUP BY
        j.bytes,
        j.files_count
),
-- win_funcs: puts next to each group's statistics the same statistics of the
-- following group when groups are ordered by bytes, then files_count, both
-- descending. The last group in that order has NULL lead_* values.
win_funcs AS (
    SELECT
        r.bytes,
        r.files_count,
        r.group_rows_count,
        r.avg,
        CAST(LEAD(r.avg) OVER by_size_desc AS FLOAT) AS lead_avg,
        r.max,
        CAST(LEAD(r.max) OVER by_size_desc AS FLOAT) AS lead_max,
        r.median,
        LEAD(r.median) OVER by_size_desc AS lead_median
    FROM results AS r
    -- Single shared window definition used by all three LEAD() calls.
    WINDOW by_size_desc AS (ORDER BY r.bytes DESC, r.files_count DESC)
),
-- report: per-group statistics plus, for each statistic, its signed difference
-- to the SAME statistic of the next group (next = following row when ordered
-- by bytes DESC, files_count DESC). Differences are rendered as text with an
-- explicit '+' prefix when positive; they are NULL for the last group, which
-- has no following row. Groups whose bytes is NULL are excluded.
report AS (
    SELECT
        d.bytes,
        d.files_count AS "files count",
        d.group_rows_count AS "repetitions",
        d.avg,
        CASE
            WHEN d.avg_delta > 0 THEN CONCAT('+', CAST(d.avg_delta AS VARCHAR))
            ELSE CAST(d.avg_delta AS VARCHAR)
        END AS "difference in AVG",
        ROUND(d.avg * 1.1, 2) AS avg_log,
        d.max,
        CASE
            WHEN d.max_delta > 0 THEN CONCAT('+', CAST(d.max_delta AS VARCHAR))
            ELSE CAST(d.max_delta AS VARCHAR)
        END AS "difference in MAX",
        d.median,
        CASE
            WHEN d.median_delta > 0 THEN CONCAT('+', CAST(d.median_delta AS VARCHAR))
            ELSE CAST(d.median_delta AS VARCHAR)
        END AS "difference in MEDIAN"
    FROM (
        -- Compute every rounded delta exactly once so the sign test and the
        -- rendered text can never disagree.
        SELECT
            wf.bytes,
            wf.files_count,
            wf.group_rows_count,
            wf.avg,
            wf.max,
            wf.median,
            ROUND(CAST(wf.avg - wf.lead_avg AS NUMERIC), 2) AS avg_delta,
            -- max is compared with lead_max (not avg with lead_max).
            ROUND(CAST(wf.max - wf.lead_max AS NUMERIC), 2) AS max_delta,
            ROUND(CAST(wf.median - wf.lead_median AS NUMERIC), 2) AS median_delta
        FROM win_funcs AS wf
        WHERE wf.bytes IS NOT NULL
    ) AS d
    -- Ordering inside a CTE is not guaranteed to reach the final result; it is
    -- kept for convenience when this CTE is inspected on its own.
    ORDER BY d.bytes DESC, d.files_count DESC
),
-- new_report: narrows report down to the columns needed for size bucketing.
--   "+/- diff" : the textual, signed "difference in AVG" as-is
--   diff       : the same value with a leading '+' or '-' removed, as NUMERIC,
--                i.e. the magnitude of the change
--                (NOTE(review): dropping the sign looks intentional -- confirm)
new_report AS (
    SELECT
        rpt.bytes,
        rpt.avg,
        rpt.avg AS avg2,
        rpt."difference in AVG" AS "+/- diff",
        ROUND(
            CAST(REGEXP_REPLACE(rpt."difference in AVG", '^[+-]', '') AS NUMERIC),
            2
        ) AS diff
    FROM report AS rpt
),
-- ranges: aggregates new_report into seven fixed size buckets and always
-- returns exactly one row per bucket (aggregates are NULL for empty buckets).
--   avg_size       : average group size in the bucket (MB), 2 decimals
--   byte_sum       : sum of group sizes in the bucket (MB)
--   avg_throughput : average of the per-group average MB/s, 2 decimals
--   diff           : average of the per-group diff values
-- `bytes` arrives in decimal megabytes (bytes / 1,000,000), so the G-level
-- boundaries are expressed with the same decimal factor: 1G = 1000 MB.
-- Every bucket is (lower, upper] except the first, which also contains 0,
-- and the last, which has no upper bound.
-- Note: mid_point carries the bucket label, not a numeric midpoint.
ranges AS (
    SELECT
        b.range_start,
        b.range_end,
        b.mid_point,
        ROUND(AVG(nr.bytes), 2) AS avg_size,
        SUM(nr.bytes) AS byte_sum,
        ROUND(AVG(nr.avg), 2) AS avg_throughput,
        AVG(nr.diff) AS diff
    FROM (
        VALUES
            (1, '0',    '1M',       '0-1M',          numrange(0,      1,      '[]')),
            (2, '1M',   '10M',      '1-10M',         numrange(1,      10,     '(]')),
            (3, '10M',  '100M',     '10-100M',       numrange(10,     100,    '(]')),
            (4, '100M', '1G',       '100M-1G',       numrange(100,    1000,   '(]')),
            (5, '1G',   '10G',      '1-10G',         numrange(1000,   10000,  '(]')),
            (6, '10G',  '100G',     '10-100G',       numrange(10000,  100000, '(]')),
            (7, '100G', 'infinity', '100G-infinity', numrange(100000, NULL,   '()'))
    ) AS b (bucket_no, range_start, range_end, mid_point, size_span)
    -- LEFT JOIN keeps buckets that contain no groups.
    LEFT JOIN new_report AS nr
        ON nr.bytes <@ b.size_span
    GROUP BY
        b.bucket_no,
        b.range_start,
        b.range_end,
        b.mid_point
    -- Emit buckets from smallest to largest.
    ORDER BY b.bucket_no
),
-- calc_log_base: the 100th root of the largest byte_sum across all buckets.
-- Taking logarithms in this base maps that largest byte_sum to 100.
-- The result is NULL when no bucket has a byte_sum.
calc_log_base AS (
    SELECT
        POWER(MAX(r.byte_sum), 1 / CAST(100 AS NUMERIC)) AS base
    FROM ranges AS r
),
-- log_base: the computed base, replaced by 2 when it cannot be used as a
-- logarithm base (non-positive, or exactly 1). A NULL base passes through
-- unchanged.
log_base AS (
    SELECT
        CASE
            WHEN clb.base <= 0 THEN 2
            WHEN clb.base = 1 THEN 2
            ELSE clb.base
        END AS base
    FROM calc_log_base AS clb
)

-- Final result: one row per size bucket. log_size is the logarithm of the
-- bucket's byte_sum in the base chosen by log_base, and 0 when byte_sum is
-- NULL or not positive.
SELECT
    r.range_start,
    r.range_end,
    r.mid_point,
    r.avg_size,
    CASE
        WHEN r.byte_sum > 0 THEN LOG(lb.base, r.byte_sum)
        ELSE 0
    END AS log_size,
    r.avg_throughput,
    r.diff
FROM ranges AS r
-- log_base always yields exactly one row, so this join only attaches the base.
CROSS JOIN log_base AS lb
