/* legal disclaimer in /opt/starfish/data/starfish/sql-copyright-and-license.md */
-- Most of this query is common with detailed_directory_churn_redash_query.sql - make sure that they stay synced
-- Per-directory churn report. For each directory row under the requested path
-- prefix on the requested volume, the change in total size and file count
-- relative to the previous row of the same path is computed; the absolute
-- changes are then summed per (path, user). Only groups whose size churn
-- reaches the cutoff are returned, largest first, capped at 200 rows.
--
-- NOTE(review): the path prefix is placed verbatim inside a LIKE pattern, so
-- any "_" or "%" characters in it behave as wildcards, not as literals.
WITH first_records AS (
    -- Earliest validity start ever recorded for each path. Deliberately not
    -- restricted to the look-back window: it is used further down to recognise
    -- the very first record of a directory.
    SELECT
        path,
        MIN(LOWER(valid)) AS min_scan_time
    FROM sf.dir
    WHERE path LIKE '{{{path_prefix}}}%'
        AND volume_id = sf.volume_id_from_name('{{{volume}}}')
    GROUP BY path
), paths AS (
    -- Directory rows whose validity started inside the look-back window.
    SELECT
        sf.dir.volume_id,
        -- Show the mapped user name; fall back to the numeric uid as text when
        -- the uid has no mapping on this volume.
        COALESCE(u.name, sf.dir.uid::text) AS username,
        path,
        (local_aggrs->'total'->>'size')::BIGINT AS total_size,
        (local_aggrs->'total'->>'files')::BIGINT AS total_files,
        valid
    FROM sf.dir
    LEFT JOIN sf.uid_mapping AS u
        ON u.uid = sf.dir.uid
        AND u.volume_id = sf.dir.volume_id
    WHERE path LIKE '{{{path_prefix}}}%'
        AND sf.dir.volume_id = sf.volume_id_from_name('{{{volume}}}')
        AND LOWER(valid) > CURRENT_TIMESTAMP - INTERVAL '{{number_of_days_to_look_back}} days'
), artificial_rows_for_removed_dirs AS (
    -- We cannot simply compare one dir row with the next one to calculate churn.
    -- Assume that a user has a directory with total size 30 GB, removes it and,
    -- after some time, creates a directory with the same name, this time with
    -- size 50 GB. sf.dir will then contain two rows:
    -- 1. path='foo/bar', valid=[N, N + 1), size=30 GB
    -- 2. path='foo/bar', valid=[N + 2, inf), size=50 GB
    -- We need to convert that into 3 rows to express the fact that the dir
    -- did not exist from N + 1 to N + 2:
    -- 1. path='foo/bar', scan_time=N, size=30 GB
    -- 2. path='foo/bar', scan_time=N + 1, size=0 GB <- artificial row for removed dir
    -- 3. path='foo/bar', scan_time=N + 2, size=50 GB
    -- The SQL below produces that artificial zero-size row.
    --
    -- The artificial row is derived from the row that FOLLOWS the gap, so it
    -- carries that row's username, and a directory that was removed without
    -- being re-created inside the window gets no artificial row at all.
    SELECT
        path,
        username,
        total_size,
        total_files,
        scan_time
    FROM (
        SELECT
            path,
            username,
            0::BIGINT AS total_size,
            0::BIGINT AS total_files,
            -- End of the previous row's validity = moment the directory vanished.
            LAG(UPPER(valid), 1) OVER w AS scan_time,
            -- True only when there is a gap between the previous row's end and
            -- this row's start. NULL for the first row of a path (no previous
            -- row), which the outer WHERE filters out.
            LAG(UPPER(valid), 1) OVER w <> LOWER(valid) AS is_new_row_after_remove
        FROM paths
        WINDOW w AS (PARTITION BY path ORDER BY LOWER(valid))
    ) AS gaps
    WHERE is_new_row_after_remove
), paths_with_artificial_rows_for_removed AS (
    SELECT
        path,
        username,
        total_size,
        total_files,
        LOWER(valid) AS scan_time
    FROM paths

    -- UNION ALL instead of UNION: an artificial row is stamped with the end of
    -- the previous validity range, which by construction differs from the start
    -- of the row after the gap, so the two branches do not produce equal rows
    -- and the de-duplicating sort is unnecessary.
    -- NOTE(review): assumes validity ranges of one path never overlap - confirm.
    UNION ALL

    SELECT
        path,
        username,
        total_size,
        total_files,
        scan_time
    FROM artificial_rows_for_removed_dirs
), detailed_churn_per_dir AS (
    -- Delta of size / file count against the previous data point of the same path.
    SELECT
        paths.path,
        username,
        COALESCE(
            total_size - LAG(total_size, 1) OVER w,
            -- Reports are used in two ways: for recent changes, and for the
            -- whole volume history. In the second case, churn includes
            -- the initial scan. For recent changes, it is a minor glitch that
            -- the first data point is not included in churn/delta
            -- (the CASE yields NULL there, and NULLs are ignored by SUM).
            CASE WHEN scan_time = fr.min_scan_time
                THEN total_size
            END
        ) AS delta_size,
        COALESCE(
            total_files - LAG(total_files, 1) OVER w,
            CASE WHEN scan_time = fr.min_scan_time
                THEN total_files
            END
        ) AS delta_files,
        scan_time
    FROM paths_with_artificial_rows_for_removed AS paths
    INNER JOIN first_records AS fr
        ON paths.path = fr.path
    WINDOW w AS (PARTITION BY paths.path ORDER BY scan_time)
)
SELECT
    '<a href="queries/QUERY_ID_FOR_REPORT(detailed_directory_churn)?p_volume={{{volume}}}&p_dir_path=' || path || '&p_days_back={{number_of_days_to_look_back}}">Details</a>' AS " ",
    path,
    username AS user,
    ROUND(SUM(ABS(delta_size)) / (1024.0 * 1024 * 1024), 2) AS "churn (GiB)",
    SUM(ABS(delta_files))::BIGINT AS "churn (number of files)"
FROM detailed_churn_per_dir
GROUP BY path, username
HAVING SUM(ABS(delta_size)) / (1024.0 * 1024 * 1024) >= {{cutoff_gib}}
-- path and username break ties so that the rows kept by LIMIT are deterministic.
ORDER BY "churn (GiB)" DESC, path, username
LIMIT 200
