From b824cf0e1f4101b4327a874ec1ca715470374a5f Mon Sep 17 00:00:00 2001 From: reasv <7143787+reasv@users.noreply.github.com> Date: Tue, 30 Mar 2021 05:02:49 +0200 Subject: [PATCH] Sha256 image deduplication --- Cargo.lock | 1 + Cargo.toml | 1 + .../20210329215000_create_image_backlog.sql | 5 +- sqlx-data.json | 890 +++++++++++------- src/api.rs | 14 +- src/db.rs | 131 ++- src/frontend.rs | 17 +- src/http.rs | 24 +- src/image_archiver.rs | 61 +- src/models.rs | 24 +- src/templates/post_file.html | 6 +- src/thread_archiver.rs | 26 +- src/util.rs | 24 +- 13 files changed, 733 insertions(+), 491 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 27ac2e4..ce2e13f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1387,6 +1387,7 @@ dependencies = [ "rust-embed", "serde", "serde_json", + "sha2", "sqlx", "tokio", "tokio-util", diff --git a/Cargo.toml b/Cargo.toml index c0af500..1e5f499 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,6 +34,7 @@ handlebars_misc_helpers = "0.11.2" sqlx = { version = "0.5", features = [ "postgres", "macros", "migrate", "runtime-tokio-rustls", "offline" ] } rust-embed = "5.9.0" mime_guess = "2.0.3" +sha2 = "0.9.3" [profile.release] lto = true diff --git a/migrations/20210329215000_create_image_backlog.sql b/migrations/20210329215000_create_image_backlog.sql index 536300c..20b1f54 100644 --- a/migrations/20210329215000_create_image_backlog.sql +++ b/migrations/20210329215000_create_image_backlog.sql @@ -1,11 +1,10 @@ CREATE TABLE image_backlog ( id BIGSERIAL PRIMARY KEY, board TEXT NOT NULL, - no TEXT NOT NULL, + no BIGINT NOT NULL, url TEXT NOT NULL, thumbnail_url TEXT NOT NULL, - filename TEXT NOT NULL, - thumbnail_filename TEXT NOT NULL, + ext TEXT NOT NULL, page INTEGER NOT NULL, file_sha256 TEXT NOT NULL, thumbnail_sha256 TEXT NOT NULL, diff --git a/sqlx-data.json b/sqlx-data.json index 9efc0fd..c127cb4 100644 --- a/sqlx-data.json +++ b/sqlx-data.json @@ -1,7 +1,166 @@ { "db": "PostgreSQL", - "022f8eb14f45e7a392cbb94815ae7d8fc3804b4d5b386da18b301e7060e35183": { - "query": "\n INSERT INTO posts(\n board, -- 1\n no, -- 2\n resto, -- 3\n sticky, -- 4\n closed, -- 5\n now, -- 6\n time, -- 7\n name, -- 8\n trip, -- 9\n id, -- 10\n capcode, -- 11\n country, -- 12\n country_name, -- 13\n sub, -- 14\n com, -- 15\n tim, -- 16\n filename, -- 17\n ext, -- 18\n fsize, -- 19\n md5, -- 20\n w, -- 21\n h, -- 22\n tn_w, -- 23\n tn_h, -- 24\n filedeleted, -- 25\n spoiler, -- 26\n custom_spoiler, -- 27\n replies, -- 28\n images, -- 29\n bumplimit, -- 30\n imagelimit, -- 31\n tag, -- 32\n semantic_url, -- 33\n since4pass, -- 34\n unique_ips, -- 35\n m_img, -- 36\n archived, -- 37\n archived_on, -- 38\n last_modified -- 39\n )\n VALUES\n ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, \n $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31, $32, $33, $34, $35, $36, $37, $38, $39)\n ON CONFLICT (board, no) DO \n UPDATE \n SET\n closed = $5,\n sticky = $4,\n filedeleted = $25,\n replies = $28,\n images = $29,\n bumplimit = $30,\n imagelimit = $31,\n unique_ips = CASE WHEN posts.unique_ips < $35 THEN $35 ELSE posts.unique_ips END,\n archived = $37,\n archived_on = $38,\n last_modified = $39\n\n WHERE posts.board = $1 AND posts.no = $2\n RETURNING *;\n ", + "0247327e65e5299612e5f78632b41ef83a0c668f23d2fca992a385b0fd61ea98": { + "query": "SELECT * FROM boards ORDER BY name ASC", + "describe": { + "columns": [ + { + "ordinal": 0, + "name": "name", + "type_info": "Text" + }, + { + "ordinal": 1, + "name": "full_images", + "type_info": "Bool" + }, + { + "ordinal": 2, + "name": "archive", + "type_info": "Bool" + } + ], + "parameters": { + "Left": [] + }, + "nullable": [ + false, + false, + false + ] + } + }, + "2628c4bc1e057d003cb7af29bd1ef8a0c3ad57ecf099ec79f10f50c73dc75cf7": { + "query": "\n SELECT * FROM thread_backlog\n ORDER BY (page, id) ASC\n LIMIT $1\n ", + "describe": { + "columns": [ + { + "ordinal": 0, + "name": "id", + "type_info": "Int8" + }, + { + "ordinal": 1, + "name": "board", + "type_info": "Varchar" + }, + { + "ordinal": 2, + "name": "no", + "type_info": "Int8" + }, + { + "ordinal": 3, + "name": "last_modified", + "type_info": "Int8" + }, + { + "ordinal": 4, + "name": "replies", + "type_info": "Int8" + }, + { + "ordinal": 5, + "name": "page", + "type_info": "Int4" + } + ], + "parameters": { + "Left": [ + "Int8" + ] + }, + "nullable": [ + false, + false, + false, + false, + false, + false + ] + } + }, + "72ba56d05d319b7aed233f7817ff018ab308eefcf61c0ec197fa521309f35d2d": { + "query": "\n SELECT t1.resto FROM posts t1\n LEFT JOIN posts t2\n ON t1.resto = t2.resto AND t1.no < t2.no\n WHERE t2.no IS NULL and t1.board = $1\n ORDER BY t1.no DESC OFFSET $2 LIMIT $3\n ", + "describe": { + "columns": [ + { + "ordinal": 0, + "name": "resto", + "type_info": "Int8" + } + ], + "parameters": { + "Left": [ + "Text", + "Int8", + "Int8" + ] + }, + "nullable": [ + false + ] + } + }, + "8c62e961e17500ed6ae2ae6b346fd3c8240863578c981a4a1ee44974284146a2": { + "query": "DELETE FROM boards WHERE name = $1", + "describe": { + "columns": [], + "parameters": { + "Left": [ + "Text" + ] + }, + "nullable": [] + } + }, + "95ed3315e557731aec7f4e384760994a9764d9680a3da57424fcef3b1c2ffaa6": { + "query": "SELECT * FROM boards WHERE name = $1", + "describe": { + "columns": [ + { + "ordinal": 0, + "name": "name", + "type_info": "Text" + }, + { + "ordinal": 1, + "name": "full_images", + "type_info": "Bool" + }, + { + "ordinal": 2, + "name": "archive", + "type_info": "Bool" + } + ], + "parameters": { + "Left": [ + "Text" + ] + }, + "nullable": [ + false, + false, + false + ] + } + }, + "9773d3cdf3f0e7554056e00fd10b9e589dcc0e94e4a7f37cc440de69152f8582": { + "query": "\n DELETE FROM posts WHERE board = $1 AND no = $2\n ", + "describe": { + "columns": [], + "parameters": { + "Left": [ + "Text", + "Int8" + ] + }, + "nullable": [] + } + }, + "b38f0f9c75b8de884a076b9ea794fcb9cea76fe8c95b99b57fe81e8867e81db3": { + "query": "\n SELECT *\n FROM posts\n WHERE board = $1\n AND tim = $2\n ", "describe": { "columns": [ { @@ -203,48 +362,21 @@ "ordinal": 39, "name": "last_modified", "type_info": "Int8" + }, + { + "ordinal": 40, + "name": "file_sha256", + "type_info": "Text" + }, + { + "ordinal": 41, + "name": "thumbnail_sha256", + "type_info": "Text" } ], "parameters": { "Left": [ - "Varchar", - "Int8", - "Int8", - "Int8", - "Int8", - "Text", - "Int8", - "Text", - "Text", - "Varchar", - "Text", - "Varchar", - "Text", - "Text", - "Text", - "Int8", - "Text", - "Text", - "Int8", - "Text", - "Int8", - "Int8", - "Int8", - "Int8", - "Int8", - "Int8", - "Int8", - "Int8", - "Int8", - "Int8", - "Int8", "Text", - "Text", - "Int8", - "Int8", - "Int8", - "Int8", - "Int8", "Int8" ] }, @@ -288,47 +420,19 @@ false, false, false, - false - ] - } - }, - "0247327e65e5299612e5f78632b41ef83a0c668f23d2fca992a385b0fd61ea98": { - "query": "SELECT * FROM boards ORDER BY name ASC", - "describe": { - "columns": [ - { - "ordinal": 0, - "name": "name", - "type_info": "Text" - }, - { - "ordinal": 1, - "name": "full_images", - "type_info": "Bool" - }, - { - "ordinal": 2, - "name": "archive", - "type_info": "Bool" - } - ], - "parameters": { - "Left": [] - }, - "nullable": [ false, false, false ] } }, - "2628c4bc1e057d003cb7af29bd1ef8a0c3ad57ecf099ec79f10f50c73dc75cf7": { - "query": "\n SELECT * FROM thread_backlog\n ORDER BY (page, id) ASC\n LIMIT $1\n ", + "b4aad47a26fa2a0a994fd508d50becec7cf118bfb7537d5ac199ec987c9481aa": { + "query": "\n UPDATE posts\n SET \n file_sha256 = $1,\n thumbnail_sha256 = $2\n WHERE board = $3 AND no = $4\n RETURNING *\n ", "describe": { "columns": [ { "ordinal": 0, - "name": "id", + "name": "post_id", "type_info": "Int8" }, { @@ -343,178 +447,243 @@ }, { "ordinal": 3, - "name": "last_modified", + "name": "resto", "type_info": "Int8" }, { "ordinal": 4, - "name": "replies", + "name": "sticky", "type_info": "Int8" }, { "ordinal": 5, - "name": "page", - "type_info": "Int4" - } - ], - "parameters": { - "Left": [ - "Int8" - ] - }, - "nullable": [ - false, - false, - false, - false, - false, - false - ] - } - }, - "3c5777d4483344b8916c088e602a76f235674ae47fe87e00fcd06f40795977d5": { - "query": "\n DELETE FROM images WHERE md5 = $1\n ", - "describe": { - "columns": [], - "parameters": { - "Left": [ - "Text" - ] - }, - "nullable": [] - } - }, - "4b7419c7ec80b5476051406ac6be2592e17a8d8d4a10772bf39a53dc112ea705": { - "query": "\n SELECT *\n FROM images\n WHERE md5 = $1\n ", - "describe": { - "columns": [ + "name": "closed", + "type_info": "Int8" + }, { - "ordinal": 0, - "name": "md5", + "ordinal": 6, + "name": "now", "type_info": "Text" }, { - "ordinal": 1, - "name": "md5_base32", + "ordinal": 7, + "name": "time", + "type_info": "Int8" + }, + { + "ordinal": 8, + "name": "name", "type_info": "Text" }, { - "ordinal": 2, - "name": "thumbnail", - "type_info": "Bool" + "ordinal": 9, + "name": "trip", + "type_info": "Text" }, { - "ordinal": 3, - "name": "full_image", - "type_info": "Bool" - } - ], - "parameters": { - "Left": [ - "Text" - ] - }, - "nullable": [ - false, - false, - false, - false - ] - } - }, - "550b52ba0863d40e0ca5e5e6648c15281ea9826f99081549af908e6db0b61b34": { - "query": "\n INSERT INTO images (md5, md5_base32, thumbnail, full_image)\n VALUES ($1, $2, $3, $4)\n ON CONFLICT(md5) DO \n UPDATE SET\n thumbnail = $3,\n full_image = $4\n WHERE images.md5 = $1\n RETURNING *;\n ", - "describe": { - "columns": [ + "ordinal": 10, + "name": "id", + "type_info": "Varchar" + }, { - "ordinal": 0, - "name": "md5", + "ordinal": 11, + "name": "capcode", "type_info": "Text" }, { - "ordinal": 1, - "name": "md5_base32", + "ordinal": 12, + "name": "country", + "type_info": "Varchar" + }, + { + "ordinal": 13, + "name": "country_name", "type_info": "Text" }, { - "ordinal": 2, - "name": "thumbnail", - "type_info": "Bool" + "ordinal": 14, + "name": "sub", + "type_info": "Text" }, { - "ordinal": 3, - "name": "full_image", - "type_info": "Bool" - } - ], - "parameters": { - "Left": [ - "Text", - "Text", - "Bool", - "Bool" - ] - }, - "nullable": [ - false, - false, - false, - false - ] - } - }, - "6e839d64c85d5c025a436a4cdc75379400ec2a6a2032ccf195e746060e1c3cb3": { - "query": "\n SELECT *\n FROM image_backlog\n WHERE board = $1\n AND md5 = $2\n ", - "describe": { - "columns": [ + "ordinal": 15, + "name": "com", + "type_info": "Text" + }, { - "ordinal": 0, - "name": "id", + "ordinal": 16, + "name": "tim", "type_info": "Int8" }, { - "ordinal": 1, - "name": "md5", + "ordinal": 17, + "name": "filename", "type_info": "Text" }, { - "ordinal": 2, - "name": "md5_base32", + "ordinal": 18, + "name": "ext", "type_info": "Text" }, { - "ordinal": 3, - "name": "board", + "ordinal": 19, + "name": "fsize", + "type_info": "Int8" + }, + { + "ordinal": 20, + "name": "md5", "type_info": "Text" }, { - "ordinal": 4, - "name": "url", + "ordinal": 21, + "name": "w", + "type_info": "Int8" + }, + { + "ordinal": 22, + "name": "h", + "type_info": "Int8" + }, + { + "ordinal": 23, + "name": "tn_w", + "type_info": "Int8" + }, + { + "ordinal": 24, + "name": "tn_h", + "type_info": "Int8" + }, + { + "ordinal": 25, + "name": "filedeleted", + "type_info": "Int8" + }, + { + "ordinal": 26, + "name": "spoiler", + "type_info": "Int8" + }, + { + "ordinal": 27, + "name": "custom_spoiler", + "type_info": "Int8" + }, + { + "ordinal": 28, + "name": "replies", + "type_info": "Int8" + }, + { + "ordinal": 29, + "name": "images", + "type_info": "Int8" + }, + { + "ordinal": 30, + "name": "bumplimit", + "type_info": "Int8" + }, + { + "ordinal": 31, + "name": "imagelimit", + "type_info": "Int8" + }, + { + "ordinal": 32, + "name": "tag", "type_info": "Text" }, { - "ordinal": 5, - "name": "thumbnail_url", + "ordinal": 33, + "name": "semantic_url", "type_info": "Text" }, { - "ordinal": 6, - "name": "filename", + "ordinal": 34, + "name": "since4pass", + "type_info": "Int8" + }, + { + "ordinal": 35, + "name": "unique_ips", + "type_info": "Int8" + }, + { + "ordinal": 36, + "name": "m_img", + "type_info": "Int8" + }, + { + "ordinal": 37, + "name": "archived", + "type_info": "Int8" + }, + { + "ordinal": 38, + "name": "archived_on", + "type_info": "Int8" + }, + { + "ordinal": 39, + "name": "last_modified", + "type_info": "Int8" + }, + { + "ordinal": 40, + "name": "file_sha256", "type_info": "Text" }, { - "ordinal": 7, - "name": "thumbnail_filename", + "ordinal": 41, + "name": "thumbnail_sha256", "type_info": "Text" } ], "parameters": { "Left": [ "Text", - "Text" + "Text", + "Text", + "Int8" ] }, "nullable": [ + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, false, false, false, @@ -526,87 +695,76 @@ ] } }, - "72ba56d05d319b7aed233f7817ff018ab308eefcf61c0ec197fa521309f35d2d": { - "query": "\n SELECT t1.resto FROM posts t1\n LEFT JOIN posts t2\n ON t1.resto = t2.resto AND t1.no < t2.no\n WHERE t2.no IS NULL and t1.board = $1\n ORDER BY t1.no DESC OFFSET $2 LIMIT $3\n ", + "b8a969fc70f660d9f9cf48d74f90c712a6d786525d14de33b90a8ae3f6e7f499": { + "query": "\n SELECT * FROM image_backlog WHERE id = $1\n ", "describe": { "columns": [ { "ordinal": 0, - "name": "resto", + "name": "id", "type_info": "Int8" - } - ], - "parameters": { - "Left": [ - "Text", - "Int8", - "Int8" - ] - }, - "nullable": [ - false - ] - } - }, - "8c62e961e17500ed6ae2ae6b346fd3c8240863578c981a4a1ee44974284146a2": { - "query": "DELETE FROM boards WHERE name = $1", - "describe": { - "columns": [], - "parameters": { - "Left": [ - "Text" - ] - }, - "nullable": [] - } - }, - "95ed3315e557731aec7f4e384760994a9764d9680a3da57424fcef3b1c2ffaa6": { - "query": "SELECT * FROM boards WHERE name = $1", - "describe": { - "columns": [ + }, { - "ordinal": 0, - "name": "name", + "ordinal": 1, + "name": "board", "type_info": "Text" }, { - "ordinal": 1, - "name": "full_images", - "type_info": "Bool" + "ordinal": 2, + "name": "no", + "type_info": "Int8" + }, + { + "ordinal": 3, + "name": "url", + "type_info": "Text" + }, + { + "ordinal": 4, + "name": "thumbnail_url", + "type_info": "Text" + }, + { + "ordinal": 5, + "name": "ext", + "type_info": "Text" + }, + { + "ordinal": 6, + "name": "page", + "type_info": "Int4" + }, + { + "ordinal": 7, + "name": "file_sha256", + "type_info": "Text" }, { - "ordinal": 2, - "name": "archive", - "type_info": "Bool" + "ordinal": 8, + "name": "thumbnail_sha256", + "type_info": "Text" } ], "parameters": { "Left": [ - "Text" + "Int8" ] }, "nullable": [ + false, + false, + false, + false, + false, + false, false, false, false ] } }, - "9773d3cdf3f0e7554056e00fd10b9e589dcc0e94e4a7f37cc440de69152f8582": { - "query": "\n DELETE FROM posts WHERE board = $1 AND no = $2\n ", - "describe": { - "columns": [], - "parameters": { - "Left": [ - "Text", - "Int8" - ] - }, - "nullable": [] - } - }, - "b38f0f9c75b8de884a076b9ea794fcb9cea76fe8c95b99b57fe81e8867e81db3": { - "query": "\n SELECT *\n FROM posts\n WHERE board = $1\n AND tim = $2\n ", + "baa97e866337b2938d382cca3b779306092c3b3605da7113e2cf436e3b909213": { + "query": "\n SELECT *\n FROM posts\n WHERE board = $1\n AND (no = $2 OR resto = $2)\n ORDER BY no ASC\n ", "describe": { "columns": [ { @@ -808,6 +966,16 @@ "ordinal": 39, "name": "last_modified", "type_info": "Int8" + }, + { + "ordinal": 40, + "name": "file_sha256", + "type_info": "Text" + }, + { + "ordinal": 41, + "name": "thumbnail_sha256", + "type_info": "Text" } ], "parameters": { @@ -856,12 +1024,89 @@ false, false, false, + false, + false, false ] } }, - "baa97e866337b2938d382cca3b779306092c3b3605da7113e2cf436e3b909213": { - "query": "\n SELECT *\n FROM posts\n WHERE board = $1\n AND (no = $2 OR resto = $2)\n ORDER BY no ASC\n ", + "c54aab656738dcb088a2fe2ddc3234f5056c9ec6b02e80fe6b095f4345ca8349": { + "query": "\n INSERT INTO image_backlog (\n board, -- 1\n no, -- 2\n url, -- 3\n thumbnail_url, -- 4\n ext, -- 5\n page, -- 6\n file_sha256, -- 7\n thumbnail_sha256 -- 8\n )\n VALUES\n ($1, $2, $3, $4, $5, $6, $7, $8)\n ON CONFLICT(board, no) DO NOTHING\n RETURNING *;\n ", + "describe": { + "columns": [ + { + "ordinal": 0, + "name": "id", + "type_info": "Int8" + }, + { + "ordinal": 1, + "name": "board", + "type_info": "Text" + }, + { + "ordinal": 2, + "name": "no", + "type_info": "Int8" + }, + { + "ordinal": 3, + "name": "url", + "type_info": "Text" + }, + { + "ordinal": 4, + "name": "thumbnail_url", + "type_info": "Text" + }, + { + "ordinal": 5, + "name": "ext", + "type_info": "Text" + }, + { + "ordinal": 6, + "name": "page", + "type_info": "Int4" + }, + { + "ordinal": 7, + "name": "file_sha256", + "type_info": "Text" + }, + { + "ordinal": 8, + "name": "thumbnail_sha256", + "type_info": "Text" + } + ], + "parameters": { + "Left": [ + "Text", + "Int8", + "Text", + "Text", + "Text", + "Int4", + "Text", + "Text" + ] + }, + "nullable": [ + false, + false, + false, + false, + false, + false, + false, + false, + false + ] + } + }, + "cacdfe225dfca9bf7b66dc858a74450706da01c0ae957a96061684e773645cf4": { + "query": "\n INSERT INTO posts(\n board, -- 1\n no, -- 2\n resto, -- 3\n sticky, -- 4\n closed, -- 5\n now, -- 6\n time, -- 7\n name, -- 8\n trip, -- 9\n id, -- 10\n capcode, -- 11\n country, -- 12\n country_name, -- 13\n sub, -- 14\n com, -- 15\n tim, -- 16\n filename, -- 17\n ext, -- 18\n fsize, -- 19\n md5, -- 20\n w, -- 21\n h, -- 22\n tn_w, -- 23\n tn_h, -- 24\n filedeleted, -- 25\n spoiler, -- 26\n custom_spoiler, -- 27\n replies, -- 28\n images, -- 29\n bumplimit, -- 30\n imagelimit, -- 31\n tag, -- 32\n semantic_url, -- 33\n since4pass, -- 34\n unique_ips, -- 35\n m_img, -- 36\n archived, -- 37\n archived_on, -- 38\n last_modified, -- 39\n file_sha256, -- 40\n thumbnail_sha256 -- 41\n )\n VALUES\n ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, \n $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31, $32, $33, $34, $35, $36, $37, $38, $39, $40, $41)\n ON CONFLICT (board, no) DO \n UPDATE \n SET\n closed = $5,\n sticky = $4,\n filedeleted = $25,\n replies = $28,\n images = $29,\n bumplimit = $30,\n imagelimit = $31,\n unique_ips = CASE WHEN posts.unique_ips < $35 THEN $35 ELSE posts.unique_ips END,\n archived = $37,\n archived_on = $38,\n last_modified = $39\n\n WHERE posts.board = $1 AND posts.no = $2\n RETURNING *;\n ", "describe": { "columns": [ { @@ -1063,12 +1308,61 @@ "ordinal": 39, "name": "last_modified", "type_info": "Int8" + }, + { + "ordinal": 40, + "name": "file_sha256", + "type_info": "Text" + }, + { + "ordinal": 41, + "name": "thumbnail_sha256", + "type_info": "Text" } ], "parameters": { "Left": [ + "Varchar", + "Int8", + "Int8", + "Int8", + "Int8", "Text", - "Int8" + "Int8", + "Text", + "Text", + "Varchar", + "Text", + "Varchar", + "Text", + "Text", + "Text", + "Int8", + "Text", + "Text", + "Int8", + "Text", + "Int8", + "Int8", + "Int8", + "Int8", + "Int8", + "Int8", + "Int8", + "Int8", + "Int8", + "Int8", + "Int8", + "Text", + "Text", + "Int8", + "Int8", + "Int8", + "Int8", + "Int8", + "Int8", + "Text", + "Text" ] }, "nullable": [ @@ -1111,6 +1405,8 @@ false, false, false, + false, + false, false ] } @@ -1330,6 +1626,16 @@ "ordinal": 39, "name": "last_modified", "type_info": "Int8" + }, + { + "ordinal": 40, + "name": "file_sha256", + "type_info": "Text" + }, + { + "ordinal": 41, + "name": "thumbnail_sha256", + "type_info": "Text" } ], "parameters": { @@ -1378,6 +1684,8 @@ false, false, false, + false, + false, false ] } @@ -1428,74 +1736,6 @@ ] } }, - "e01c18090995d1d955630c125891c6c36ebe80cfa355fe39d01d15a33376823b": { - "query": "\n INSERT INTO image_backlog (md5, md5_base32, board, url, thumbnail_url, filename, thumbnail_filename)\n VALUES\n ($1, $2, $3, $4, $5, $6, $7)\n ON CONFLICT(board, md5) DO\n UPDATE SET\n url = $4,\n thumbnail_url = $5\n RETURNING *;\n ", - "describe": { - "columns": [ - { - "ordinal": 0, - "name": "id", - "type_info": "Int8" - }, - { - "ordinal": 1, - "name": "md5", - "type_info": "Text" - }, - { - "ordinal": 2, - "name": "md5_base32", - "type_info": "Text" - }, - { - "ordinal": 3, - "name": "board", - "type_info": "Text" - }, - { - "ordinal": 4, - "name": "url", - "type_info": "Text" - }, - { - "ordinal": 5, - "name": "thumbnail_url", - "type_info": "Text" - }, - { - "ordinal": 6, - "name": "filename", - "type_info": "Text" - }, - { - "ordinal": 7, - "name": "thumbnail_filename", - "type_info": "Text" - } - ], - "parameters": { - "Left": [ - "Text", - "Text", - "Text", - "Text", - "Text", - "Text", - "Text" - ] - }, - "nullable": [ - false, - false, - false, - false, - false, - false, - false, - false - ] - } - }, "e2a506eca1644ec72666c07852281f8a96cb4058ecbd4b539945c1c3fca72002": { "query": "\n INSERT INTO thread_backlog (board, no, last_modified, replies, page)\n VALUES\n ($1, $2, $3, $4, $5)\n ON CONFLICT(board, no, last_modified) DO\n UPDATE SET\n replies = $4,\n page = $5\n RETURNING *;\n ", "describe": { @@ -1561,37 +1801,42 @@ }, { "ordinal": 1, - "name": "md5", + "name": "board", "type_info": "Text" }, { "ordinal": 2, - "name": "md5_base32", - "type_info": "Text" + "name": "no", + "type_info": "Int8" }, { "ordinal": 3, - "name": "board", + "name": "url", "type_info": "Text" }, { "ordinal": 4, - "name": "url", + "name": "thumbnail_url", "type_info": "Text" }, { "ordinal": 5, - "name": "thumbnail_url", + "name": "ext", "type_info": "Text" }, { "ordinal": 6, - "name": "filename", - "type_info": "Text" + "name": "page", + "type_info": "Int4" }, { "ordinal": 7, - "name": "thumbnail_filename", + "name": "file_sha256", + "type_info": "Text" + }, + { + "ordinal": 8, + "name": "thumbnail_sha256", "type_info": "Text" } ], @@ -1608,6 +1853,7 @@ false, false, false, + false, false ] } diff --git a/src/api.rs b/src/api.rs index b00c7d5..a0dca7e 100644 --- a/src/api.rs +++ b/src/api.rs @@ -7,7 +7,7 @@ use tokio::fs::create_dir_all; use crate::db::DBClient; use crate::frontend::{thread_page, index_page, build_handlebars, dist}; -use crate::util::{base64_to_32, get_image_folder}; +use crate::util::{get_file_folder}; use crate::models::{IndexPage, BoardsStatus}; #[get("/{board}/thread/{no}.json")] @@ -70,18 +70,18 @@ async fn get_boards_status(db: web::Data) -> Result, board: String, tim: i64, ext: String, is_thumb: bool)-> Result { - let md5_base64 = db.image_tim_to_md5(&board, tim).await + let sha256_base32 = db.image_tim_to_sha256(&board, tim, is_thumb).await .map_err(|e| { error!("Error getting image from DB: {}", e); HttpResponse::InternalServerError().finish() })? .ok_or(HttpResponse::NotFound().finish())?; - - let md5_base32 = base64_to_32(md5_base64.clone()).unwrap(); - let path = match is_thumb { - true => get_image_folder(&md5_base64, true).join(format!("{}.jpg", md5_base32)), - false => get_image_folder(&md5_base64, false).join(format!("{}.{}", md5_base32, ext)) + + let filename = match is_thumb { + true => format!("{}.jpg", sha256_base32), + false => format!("{}.{}", sha256_base32, ext) }; + let path = get_file_folder(&sha256_base32, is_thumb).join(filename); NamedFile::open(path).map_err(|e| { error!("Error getting image from filesystem: {}", e); HttpResponse::NotFound().finish() diff --git a/src/db.rs b/src/db.rs index 8477269..850dd58 100644 --- a/src/db.rs +++ b/src/db.rs @@ -89,16 +89,12 @@ impl DBClient { .rows_affected(); Ok(res) } - pub async fn get_image_job(&self, board: &String, md5: &String) -> anyhow::Result> { + pub async fn get_image_job(&self, job_id: i64) -> anyhow::Result> { let job = sqlx::query_as!(ImageJob, " - SELECT * - FROM image_backlog - WHERE board = $1 - AND md5 = $2 + SELECT * FROM image_backlog WHERE id = $1 ", - board, - md5 + job_id, ).fetch_optional(&self.pool) .await?; Ok(job) @@ -106,22 +102,29 @@ impl DBClient { pub async fn insert_image_job(&self, img: &ImageInfo) -> anyhow::Result { let job = sqlx::query_as!(ImageJob, " - INSERT INTO image_backlog (md5, md5_base32, board, url, thumbnail_url, filename, thumbnail_filename) + INSERT INTO image_backlog ( + board, -- 1 + no, -- 2 + url, -- 3 + thumbnail_url, -- 4 + ext, -- 5 + page, -- 6 + file_sha256, -- 7 + thumbnail_sha256 -- 8 + ) VALUES - ($1, $2, $3, $4, $5, $6, $7) - ON CONFLICT(board, md5) DO - UPDATE SET - url = $4, - thumbnail_url = $5 + ($1, $2, $3, $4, $5, $6, $7, $8) + ON CONFLICT(board, no) DO NOTHING RETURNING *; ", - img.md5, - img.md5_base32, - img.board, - img.url, - img.thumbnail_url, - img.filename, - img.thumbnail_filename, + img.board, //1 + img.no, //2 + img.url, //3 + img.thumbnail_url, //4 + img.ext, //5 + img.page, //6 + img.file_sha256, //7 + img.thumbnail_sha256 //8 ).fetch_one(&self.pool) .await?; Ok(job) @@ -174,8 +177,8 @@ impl DBClient { .await?; Ok(Some(job)) } - pub async fn image_tim_to_md5(&self, board: &String, image_tim: i64) -> anyhow::Result> { - let post = sqlx::query_as!(Post, + pub async fn image_tim_to_sha256(&self, board: &String, image_tim: i64, thumb: bool) -> anyhow::Result> { + let post_opt = sqlx::query_as!(Post, " SELECT * FROM posts @@ -186,55 +189,15 @@ impl DBClient { image_tim ).fetch_optional(&self.pool) .await?; - Ok(post.map(|p|p.md5)) - } - pub async fn insert_image(&self, img: &Image) -> anyhow::Result { - let image = sqlx::query_as!(Image, - " - INSERT INTO images (md5, md5_base32, thumbnail, full_image) - VALUES ($1, $2, $3, $4) - ON CONFLICT(md5) DO - UPDATE SET - thumbnail = $3, - full_image = $4 - WHERE images.md5 = $1 - RETURNING *; - ", - img.md5, - img.md5_base32, - img.thumbnail, - img.full_image - ).fetch_one(&self.pool) - .await?; - - Ok(image) - } - pub async fn delete_image(&self, md5: &String) -> anyhow::Result { - let res: u64 = sqlx::query!( - " - DELETE FROM images WHERE md5 = $1 - ", - md5, - ).execute(&self.pool) - .await? - .rows_affected(); - Ok(res) - } - pub async fn image_exists(&self, md5: &String) -> anyhow::Result<(bool, bool)> { - let image_opt = sqlx::query_as!(Image, - " - SELECT * - FROM images - WHERE md5 = $1 - ", - md5, - ) - .fetch_optional(&self.pool) - .await?; - match image_opt { - Some(img) => Ok((img.thumbnail, img. full_image)), - None => Ok((false, false)) + if let Some(post) = post_opt { + if thumb && !post.thumbnail_sha256.is_empty() { + return Ok(Some(post.thumbnail_sha256)) + } + if !thumb && !post.file_sha256.is_empty() { + return Ok(Some(post.file_sha256)) + } } + Ok(None) } pub async fn get_post(&self, board: &String, post_no: i64) -> anyhow::Result> { let post = sqlx::query_as!(Post, @@ -306,6 +269,24 @@ impl DBClient { } Ok(Some(Thread{posts})) } + pub async fn set_post_files(&self, board: &String, no: i64, file_sha256: &String, thumbnail_sha256: &String) -> anyhow::Result> { + let post = sqlx::query_as!(Post, + " + UPDATE posts + SET + file_sha256 = $1, + thumbnail_sha256 = $2 + WHERE board = $3 AND no = $4 + RETURNING * + ", + file_sha256, + thumbnail_sha256, + board, + no + ).fetch_optional(&self.pool) + .await?; + Ok(post) + } pub async fn insert_posts(&self, entries: &Vec) -> anyhow::Result> { let mut posts = Vec::new(); for entry in entries { @@ -350,11 +331,13 @@ impl DBClient { m_img, -- 36 archived, -- 37 archived_on, -- 38 - last_modified -- 39 + last_modified, -- 39 + file_sha256, -- 40 + thumbnail_sha256 -- 41 ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, - $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31, $32, $33, $34, $35, $36, $37, $38, $39) + $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31, $32, $33, $34, $35, $36, $37, $38, $39, $40, $41) ON CONFLICT (board, no) DO UPDATE SET @@ -411,7 +394,9 @@ impl DBClient { entry.m_img, //36 entry.archived, //37 entry.archived_on, //38 - entry.last_modified //39 + entry.last_modified, //39 + entry.file_sha256, //40, + entry.thumbnail_sha256 //41 ) .fetch_one(&self.pool) .await?; diff --git a/src/frontend.rs b/src/frontend.rs index ae5b7ab..5b7660d 100644 --- a/src/frontend.rs +++ b/src/frontend.rs @@ -10,7 +10,7 @@ use handlebars::{Handlebars, RenderContext, Helper, Context, JsonRender, HelperR use handlebars::handlebars_helper; use handlebars_misc_helpers::register; -use crate::util::{shorten_string, string_to_idcolor,base64_to_32, get_image_url}; +use crate::util::{shorten_string, string_to_idcolor,base64_to_32, get_file_url}; use crate::db::DBClient; use crate::models::{IndexThread, Post, IndexPost, Board}; @@ -136,12 +136,17 @@ pub(crate) fn build_handlebars() -> Handlebars<'static> { out.write(base64_to_32(b64_text).unwrap_or_default().as_ref())?; Ok(()) })); - handlebars.register_helper("get_image_url", + handlebars.register_helper("get_file_url", Box::new(|h: &Helper, _r: &Handlebars, _: &Context, _rc: &mut RenderContext, out: &mut dyn Output| -> HelperResult { - let b64_text = h.param(0).ok_or(RenderError::new("base64 not found"))?.value().render(); - let is_thumb = h.param(1).ok_or(RenderError::new("Boolan not found"))?; - let is_thumb_bool = is_thumb.value().as_bool().unwrap_or_default(); - out.write(get_image_url(&b64_text, is_thumb_bool).as_ref())?; + let sha256 = h.param(0).ok_or(RenderError::new("sha256 not found"))?.value().render(); + let ext = h.param(1).ok_or(RenderError::new("ext not found"))?.value().render(); + out.write(get_file_url(&sha256, &ext, false).as_ref())?; + Ok(()) + })); + handlebars.register_helper("get_thumbnail_url", + Box::new(|h: &Helper, _r: &Handlebars, _: &Context, _rc: &mut RenderContext, out: &mut dyn Output| -> HelperResult { + let sha256 = h.param(0).ok_or(RenderError::new("sha256 not found"))?.value().render(); + out.write(get_file_url(&sha256, &".jpg".to_string(), true).as_ref())?; Ok(()) })); handlebars diff --git a/src/http.rs b/src/http.rs index 4097fe7..2f0119b 100644 --- a/src/http.rs +++ b/src/http.rs @@ -10,7 +10,9 @@ use tokio::fs::File; use tokio::io::AsyncWriteExt; use tokio::time::Duration; use log::{info, warn, error, debug}; +use tokio::fs::create_dir_all; +use crate::util::{hash_file, get_file_folder}; async fn write_bytes_to_file(filename: &Path, file_bytes: bytes::Bytes) -> anyhow::Result<()> { Ok(File::create(filename).await?.write_all(&file_bytes).await?) @@ -90,7 +92,7 @@ impl HttpClient { Ok(obj) } - pub async fn download_file(&self, url: &String, filename: &Path) -> bool { + pub async fn _download_file(&self, url: &String, filename: &Path) -> bool { let bytes = match self.fetch_url_backoff(url, &"download".to_string()).await { Ok(b) => b, Err(msg) => { @@ -106,4 +108,24 @@ impl HttpClient { } } } + pub async fn download_file_checksum(&self, url: &String, ext: &String, is_thumb: bool) -> Option { + let bytes = match self.fetch_url_backoff(url, &"download".to_string()).await { + Ok(b) => b, + Err(msg) => { + error!("Failed to download {} Error: {}", url, msg); + return None + } + }; + let hash = hash_file(&bytes); + let folder = get_file_folder(&hash, is_thumb); + create_dir_all(&folder).await.ok(); + let filename = folder.join(hash.clone() + ext); + match write_bytes_to_file(&filename, bytes).await { + Ok(()) => Some(hash), + Err(msg) => { + error!("Could not write to file {}: {}", filename.to_str().unwrap_or_default(), msg); + None + } + } + } } \ No newline at end of file diff --git a/src/image_archiver.rs b/src/image_archiver.rs index 9299825..573c1c1 100644 --- a/src/image_archiver.rs +++ b/src/image_archiver.rs @@ -2,10 +2,8 @@ use std::time::Duration; use std::collections::HashSet; #[allow(unused_imports)] use log::{info, warn, error, debug}; -use tokio::fs::create_dir_all; -use crate::models::{Image, ImageJob}; -use crate::util::{get_image_folder}; +use crate::models::ImageJob; use crate::archiver::Archiver; impl Archiver { @@ -57,49 +55,26 @@ impl Archiver { ) } pub async fn archive_image(&self, job: &ImageJob, need_full_image: bool) -> Result<(),()> { - let thumbnail_folder = get_image_folder(&job.md5, true); - let full_folder = get_image_folder(&job.md5, false); - create_dir_all(&thumbnail_folder).await.ok(); - create_dir_all(&full_folder).await.ok(); + let mut thumbnail_sha256 = job.thumbnail_sha256.clone(); + let mut file_sha256 = job.file_sha256.clone(); - let (thumb_exists, full_exists) = self.db_client.image_exists(&job.md5).await - .map_err(|e| {error!("Failed to get image status from database: {}", e); e} ) - .unwrap_or((false, false)); - - let mut image = Image{ - md5: job.md5.clone(), - thumbnail: thumb_exists, - full_image: full_exists, - md5_base32: job.md5_base32.clone() - }; - - let thumb_success = match !thumb_exists { - true => self.http_client.download_file(&job.thumbnail_url, - &thumbnail_folder.join(&job.thumbnail_filename)).await, - false => thumb_exists - }; - - image.thumbnail = thumb_success; - - self.db_client.insert_image(&image).await - .map_err(|e| {error!("Failed to insert image {} into database: {}", job.md5, e);})?; - - info!("Processed thumbnail {} ({})", job.md5, job.thumbnail_filename); - - let full_success = match need_full_image && !full_exists { - true => self.http_client.download_file(&job.url, - &full_folder.join(&job.filename)).await, - false => full_exists - }; - - image.full_image = full_success; + if thumbnail_sha256.is_empty() { + thumbnail_sha256 = self.http_client.download_file_checksum(&job.thumbnail_url, &".jpg".to_string(), true).await + .unwrap_or(thumbnail_sha256); + info!("Processed thumbnail for /{}/{}", job.board, job.no); + self.db_client.set_post_files(&job.board, job.no, &file_sha256, &thumbnail_sha256).await + .map_err(|e| {error!("Failed to update file for post: /{}/{}: {}", job.board, job.no, e);})?; + } - self.db_client.insert_image(&image).await - .map_err(|e| {error!("Failed to insert image {} into database: {}", job.md5, e);})?; + if file_sha256.is_empty() && need_full_image { + file_sha256 = self.http_client.download_file_checksum(&job.url, &job.ext, false).await + .unwrap_or(file_sha256); + info!("Processed full image for /{}/{}", job.board, job.no); + self.db_client.set_post_files(&job.board, job.no, &file_sha256, &thumbnail_sha256).await + .map_err(|e| {error!("Failed to update file for post: /{}/{}: {}", job.board, job.no, e);})?; + } self.db_client.delete_image_job(job.id).await - .map_err(|e| {error!("Failed to delete image {} from backlog: {}", job.md5, e);})?; - - info!("Processed image {} ({}) successfully", job.md5, job.filename); + .map_err(|e| {error!("Failed to delete file job {} from backlog: {}", job.id, e);})?; Ok(()) } pub fn run_image_cycle(&self) -> tokio::task::JoinHandle<()> { diff --git a/src/models.rs b/src/models.rs index a3d29b0..ae71c70 100644 --- a/src/models.rs +++ b/src/models.rs @@ -77,7 +77,11 @@ pub struct Post { #[serde(default)] pub archived_on: i64, #[serde(default)] - pub last_modified: i64 + pub last_modified: i64, + #[serde(default)] + pub file_sha256: String, + #[serde(default)] + pub thumbnail_sha256: String } #[derive(Debug, Clone, Deserialize, Serialize, Default)] pub struct PostUpdate { @@ -148,24 +152,26 @@ pub struct ThreadJob { #[derive(Debug, Clone, Default, Deserialize, Serialize)] pub struct ImageInfo { - pub md5: String, - pub md5_base32: String, pub board: String, + pub no: i64, pub url: String, pub thumbnail_url: String, - pub filename: String, - pub thumbnail_filename: String, + pub ext: String, + pub page: i32, + pub file_sha256: String, + pub thumbnail_sha256: String } #[derive(Debug, Clone, Default, Deserialize, Serialize, Eq, PartialEq)] pub struct ImageJob { pub id: i64, - pub md5: String, - pub md5_base32: String, pub board: String, + pub no: i64, pub url: String, pub thumbnail_url: String, - pub filename: String, - pub thumbnail_filename: String, + pub ext: String, + pub page: i32, + pub file_sha256: String, + pub thumbnail_sha256: String } #[derive(Debug, Clone, Deserialize, Serialize, Eq, PartialEq)] diff --git a/src/templates/post_file.html b/src/templates/post_file.html index b4cdc27..709e04a 100644 --- a/src/templates/post_file.html +++ b/src/templates/post_file.html @@ -1,10 +1,10 @@ {{#if tim}}
- {{#unless spoiler}}{{#unless spoiler}}{{b_to_kb fsize}} KB{{else}}{{b_to_kb fsize}} KB Option { + pub fn get_post_image_info(&self, board: &String, page: i32, post: &Post) -> Option { if post.tim == 0 || post.filedeleted == 1 { return None // no image } let url = format!("https://i.4cdn.org/{}/{}{}", board, post.tim, post.ext); let thumbnail_url = format!("https://i.4cdn.org/{}/{}s.jpg", board, post.tim); - let md5_b32 = match base64_to_32(post.md5.clone()) { - Ok(b32) => b32, - Err(e) => { - error!("Error converting image to base32: {}", e); - return None - } - }; - let filename = format!("{}{}", md5_b32, post.ext); - let thumbnail_filename = format!("{}.jpg", md5_b32); - Some(ImageInfo{url, thumbnail_url, filename, thumbnail_filename, md5: post.md5.clone(), md5_base32: md5_b32, board: board.clone()}) + Some(ImageInfo{url, thumbnail_url, ext: post.ext.clone(), file_sha256: post.file_sha256.clone(), thumbnail_sha256: post.thumbnail_sha256.clone(), page, no: post.no, board: board.clone()}) } pub async fn thread_cycle(&self) -> anyhow::Result<()> { let (tx, mut rx) = tokio::sync::mpsc::channel(100); @@ -83,16 +74,17 @@ impl Archiver { let thread = thread_opt.unwrap_or_default(); let posts: Vec = thread.posts.clone().into_iter().map(|mut post|{post.board = job.board.clone(); post.last_modified = job.last_modified; post}).collect(); - let image_jobs = posts.iter().filter_map(|post| self.get_post_image_info(&job.board,post)).collect::>(); + + let inserted_posts = self.db_client.insert_posts(&posts).await + .map_err(|e| {error!("Failed to insert thread /{}/{} into database: {}", job.board, job.no, e); job.clone()})?; - self.db_client.insert_posts(&posts).await - .map_err(|e| {error!("Failed to insert thread /{}/{} into database: {}", - job.board, job.no, e); job.clone()}).ok(); + let image_jobs = inserted_posts.iter().filter_map(|post| self.get_post_image_info(&job.board, job.page, post)) + .collect::>(); for image_info in image_jobs { self.db_client.insert_image_job(&image_info).await .map_err(|e| {error!("Failed to insert image job /{}/{} into database: {}", - job.board, image_info.md5.clone(), e); job.clone()}).ok(); + job.board, image_info.no, e); job.clone()})?; } self.db_client.delete_thread_job(job.id).await .map_err(|e| {error!("Failed to delete thread /{}/{} from backlog: {}", job.board, job.no, e); job})?; diff --git a/src/util.rs b/src/util.rs index d71a893..117a2eb 100644 --- a/src/util.rs +++ b/src/util.rs @@ -2,6 +2,13 @@ use std::path::{Path, PathBuf}; use base64::decode; use base32::{Alphabet, encode}; use unicode_truncate::UnicodeTruncateStr; +use sha2::{Sha256, Digest}; + +pub fn hash_file(bytes: &[u8]) -> String { + let mut hasher = Sha256::new(); + hasher.update(bytes); + encode(Alphabet::RFC4648{padding: false}, hasher.finalize().as_slice()) +} fn bad_hash(s: String) -> i64 { let mut msg = 0i64; @@ -43,22 +50,25 @@ pub fn base64_to_32(b64: String) -> anyhow::Result { Ok(s) } -pub fn get_image_folder(md5_b64: &String, is_thumb: bool) -> PathBuf { +pub fn get_file_folder(sha256: &String, is_thumb: bool) -> PathBuf { let data_folder_str = std::env::var("DATA_ROOT").unwrap_or("data".to_string()); let image_folder = Path::new(&data_folder_str).join("images"); let folder = match is_thumb { true => image_folder.join("thumb"), false => image_folder.join("full") }; - let md5_b32 = base64_to_32(md5_b64.to_string()).unwrap_or("99invalid_md5_placeholder".to_string()); - folder.join(&md5_b32[0..2]) + folder.join(&sha256[0..2]) } -pub fn get_image_url(md5_b64: &String, is_thumb: bool) -> String { +pub fn get_file_url(sha256: &String, ext: &String, is_thumb: bool) -> String { let folder = match is_thumb { true => "thumb", false => "full" }; - let md5_b32 = base64_to_32(md5_b64.to_string()).unwrap_or("invalid_md5_placeholder".to_string()); - format!("/img/{}/{}/{}", folder, &md5_b32[0..2], md5_b32) -} \ No newline at end of file + if sha256.len() < 2 { + return "/static/image/favicon-ws.ico".to_string(); + } + + format!("/img/{}/{}/{}{}", folder, &sha256[0..2], sha256, ext) +} +