File fclones-0.34.0.obscpio of Package fclones
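The `.obscpio` file shown here is a cpio archive in SVR4 newc format. Assuming GNU cpio is installed, it can be listed and unpacked with:

    cpio -it   < fclones-0.34.0.obscpio   # list the archive members
    cpio -idmv < fclones-0.34.0.obscpio   # extract, creating directories and preserving modification times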
fclones-0.34.0/.circleci/config.yml

    version: 2
    jobs:
      format:
        docker:
          - image: cimg/rust:1.70.0
        steps:
          - checkout
          - run:
              name: Version information
              command: |
                rustc --version; cargo --version; rustup --version
          - run:
              name: Install formatter
              command: |
                rustup component add rustfmt
          - run:
              name: Check format
              command: |
                cargo fmt --all -- --check
      build:
        docker:
          - image: cimg/rust:1.70.0
        environment:
          RUSTFLAGS: '-D warnings'
        steps:
          - checkout
          - run:
              name: Calculate dependencies
              command: |
                rustc --version >rust-version
                test -e Cargo.lock || cargo generate-lockfile
          - restore_cache:
              keys:
                - cargo-cache-v1-{{ arch }}-{{checksum "rust-version"}}-{{ checksum "Cargo.lock" }}
          - run:
              name: Build
              command: |
                cargo build --all
          - save_cache:
              key: cargo-cache-v1-{{ arch }}-{{checksum "rust-version"}}-{{ checksum "Cargo.lock" }}
              paths:
                - /usr/local/cargo/registry
                - target/debug/.fingerprint
                - target/debug/build
                - target/debug/deps
      clippy:
        docker:
          - image: cimg/rust:1.70.0
        steps:
          - checkout
          - run:
              name: Version information
              command: |
                rustc --version; cargo --version; rustup --version
          - run:
              name: Install Clippy
              command: |
                rustup component add clippy
          - run:
              name: Run Clippy
              command: |
                cargo clippy --all -- -D warnings -D rust-2018-idioms
      test:
        docker:
          - image: cimg/rust:1.70.0
        environment:
          RUSTFLAGS: '-D warnings'
        steps:
          - checkout
          - run:
              name: Calculate dependencies
              command: |
                rustc --version >rust-version
                test -e Cargo.lock || cargo generate-lockfile
          - restore_cache:
              keys:
                - cargo-cache-v1-{{ arch }}-{{checksum "rust-version"}}-{{ checksum "Cargo.lock" }}
          - run:
              name: Build
              command: |
                cargo build --all
          - save_cache:
              key: cargo-cache-v1-{{ arch }}-{{checksum "rust-version"}}-{{ checksum "Cargo.lock" }}
              paths:
                - /usr/local/cargo/registry
                - target/debug/.fingerprint
                - target/debug/build
                - target/debug/deps
          - run:
              name: Test
              command: |
                export RUST_BACKTRACE=1
                cargo test
    workflows:
      version: 2
      all-checks:
        jobs:
          - format
          - build
          - clippy
          - test

fclones-0.34.0/.gitignore

    /target
    /vendor
    **/*.rs.bk
    .idea
    .vscode
    .rpm
    *.iml

fclones-0.34.0/Cargo.lock

    # This file is automatically @generated by Cargo.
    # It is not intended for manual editing.
    version = 3

The remaining `[[package]]` entries pin the exact version, registry source and checksum of every dependency, from aho-corasick 1.1.2 through xxhash-rust 0.8.7, together with the workspace crates fclones 0.34.0 and gen-test-files 0.1.0.

fclones-0.34.0/Cargo.toml

    [workspace]
    members = ["fclones", "gen-test-files"]
    default-members = ["fclones"]
    resolver = "2"

    [profile.release]
    panic = "abort"
    lto = true

fclones-0.34.0/LICENSE

    MIT License

    Copyright (c) 2020 Piotr Kołaczkowski

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

    The above copyright notice and this permission notice shall be included in all
    copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
    SOFTWARE.
fclones-0.34.0/README.md

    symbolic link -> fclones/README.md

fclones-0.34.0/fclones/Cargo.toml

    [package]
    name = "fclones"
    version = "0.34.0"
    description = "Finds and removes duplicate files"
    authors = ["Piotr Kołaczkowski <pkolaczk@gmail.com>"]
    homepage = "https://github.com/pkolaczk/fclones"
    repository = "https://github.com/pkolaczk/fclones"
    categories = ["filesystem", "command-line-utilities"]
    keywords = ["duplicate", "dupe", "finder", "search", "cleaner"]
    license = "MIT"
    readme = "README.md"
    edition = "2021"
    exclude = [ ".circleci", "release.sh" ]
    rust-version = "1.70"

    [dependencies]
    bincode = "1.3"
    blake3 = { version = "1.3", optional = true }
    byteorder = "1.4"
    bytesize = "1.1"
    byte-unit = "4.0"
    chrono = { version = "0.4", default-features = false, features = ["serde", "clock", "std"] }
    clap = { version = "4.4", features = ["derive", "cargo", "wrap_help"] }
    console = "0.15"
    crossbeam-channel = "0.5"
    crossbeam-utils = "0.8"
    csv = "1.1"
    dashmap = "5.2"
    dirs = "5.0.1"
    dtparse = "2"
    dunce = "1.0"
    fallible-iterator = "0.3"
    filetime = "0.2"
    hex = "0.4"
    ignore = "0.4.18"
    indexmap = "2"
    itertools = "0.11"
    lazy-init = "0.5"
    lazy_static = "1.4"
    maplit = "1.0"
    metrohash = "1.0"
    nom = "7"
    nom-regex = "0.2"
    num_cpus = "1.13"
    priority-queue = "1.2"
    rand = "0.8"
    rayon = "1.5"
    regex = "1.5"
    serde_json = "1.0"
    serde = { version = "1.0", features = ["derive"] }
    sha2 = { version = "0.10", optional = true }
    sha3 = { version = "0.10", optional = true }
    sled = "0.34"
    smallvec = "1.8"
    status-line = "0.2.0"
    stfu8 = "0.2"
    sysinfo = "0.29"
    thread_local = "1.1"
    typed-sled = "0.2.0"
    uuid = { version = "1.1", features = ["v4"] }
    xxhash-rust = { version = "0.8", features = ["xxh3"], optional = true }

    [features]
    default = ["blake3", "xxhash", "sha2", "sha3"]
    xxhash = ["xxhash-rust"]

    [target.'cfg(target_os = "linux")'.dependencies]
    fiemap = "0.1"

    [target.'cfg(unix)'.dependencies]
    file-owner = "0.1"
    libc = "0.2"
    nix = { version = "0.27", features = [ "user", "fs", "ioctl" ] }
    xattr = "1"

    [target.'cfg(windows)'.dependencies]
    winapi = "0.3"
    winapi-util = "0.1"

    [target.'cfg(not(any(target_os = "linux", target_os = "android")))'.dependencies]
    reflink = "0.1"

    [dev-dependencies]
    assert_matches = "1.5"
    reflink = "0.1"
    serde_test = "1.0"
    tempfile = "3"

fclones-0.34.0/fclones/README.md

fclones
===============================================

**Efficient duplicate file finder and remover**

[![CircleCI](https://circleci.com/gh/pkolaczk/fclones.svg?style=shield)](https://circleci.com/gh/pkolaczk/fclones)
[![crates.io](https://img.shields.io/crates/v/fclones.svg)](https://crates.io/crates/fclones)
[![Documentation](https://docs.rs/fclones/badge.svg)](https://docs.rs/fclones)
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)

This is the repo for the command-line fclones and its core libraries.
For the desktop frontend, see [fclones-gui](https://github.com/pkolaczk/fclones-gui).

---

`fclones` is a command line utility that identifies groups of identical files and gets rid of the file copies you no longer need.
It comes with plenty of configuration options for controlling the search scope and offers many ways of removing duplicates.
For maximum flexibility, it integrates well with other Unix utilities like `find` and it speaks JSON, so you have a lot of
control over the search and cleanup process.

`fclones` treats your data seriously. You can inspect and modify the list of duplicate files before removing them.
There is also a `--dry-run` option that can tell you exactly what changes would be made on the file system.

`fclones` has been implemented in Rust with a strong focus on high performance on modern hardware. It employs several
optimization techniques not present in many other programs. It adapts to the type of the hard drive, orders file operations
by physical data placement on HDDs, scans the directory tree in parallel and uses prefix compression of paths to reduce
memory consumption when working with millions of files. It is also page-cache friendly and does not push your data out of
the cache. As a result, `fclones` easily outperforms many other popular duplicate finders by a wide margin on either SSD
or HDD storage.

`fclones` is available on a wide variety of operating systems, but it works best on Linux.

- [Features](#features)
- [Demo](#demo)
- [Installation](#installation)
- [Usage](#usage)
- [Algorithm](#the-algorithm)
- [Tuning](#tuning)
- [Benchmarks](#benchmarks)

## Features

* Identifying groups of identical files
  - finding duplicate files
  - finding files with more than N replicas
  - finding unique files
  - finding files with fewer than N replicas
* Advanced file selection for reducing the amount of data to process
  - scanning multiple directory roots
  - can work with a list of files piped directly from standard input
  - recursive/non-recursive file selection
  - recursion depth limit
  - filtering names and paths by extended UNIX globs
  - filtering names and paths by regular expressions
  - filtering by min/max file size
  - proper handling of symlinks and hardlinks
* Removing redundant data
  - removing, moving or replacing files with soft or hard links
  - removing redundant file data using native copy-on-write (reflink) support on some file systems
  - selecting files for removal by path or name patterns
  - prioritizing files to remove by creation, modification, last access time or nesting level
* High performance
  - parallel processing capability in all I/O and CPU heavy stages
  - automatic tuning of parallelism and access strategy based on device type (SSD vs HDD)
  - low memory footprint thanks to heavily optimized path representation
  - variety of fast non-cryptographic and cryptographic hash functions up to 512 bits wide
  - doesn't push data out of the page-cache (Linux-only)
  - optional persistent caching of file hashes
  - accurate progress reporting
* Variety of output formats for easy further processing of results
  - standard text format
    - groups separated by group headers with file size and hash
    - one path per line in a group
    - optional `fdupes` compatibility (no headers, no indent, groups separated by blank lines)
  - machine-readable formats: `CSV`, `JSON`

### Limitations

Copy-on-write file data deduplication (reflink) is not supported on Windows.
Some optimisations are not available on platforms other than Linux:

- ordering of file accesses by physical placement
- page-cache drop-behind

## Demo

Let's first create some files:

    $ mkdir test
    $ cd test
    $ echo foo >foo1.txt
    $ echo foo >foo2.txt
    $ echo foo >foo3.txt
    $ echo bar >bar1.txt
    $ echo bar >bar2.txt

Now let's identify the duplicates:

    $ fclones group . >dupes.txt
    [2021-06-05 18:21:33.358] fclones: info: Started grouping
    [2021-06-05 18:21:33.738] fclones: info: Scanned 7 file entries
    [2021-06-05 18:21:33.738] fclones: info: Found 5 (20 B) files matching selection criteria
    [2021-06-05 18:21:33.738] fclones: info: Found 4 (16 B) candidates after grouping by size
    [2021-06-05 18:21:33.738] fclones: info: Found 4 (16 B) candidates after grouping by paths and file identifiers
    [2021-06-05 18:21:33.739] fclones: info: Found 3 (12 B) candidates after grouping by prefix
    [2021-06-05 18:21:33.740] fclones: info: Found 3 (12 B) candidates after grouping by suffix
    [2021-06-05 18:21:33.741] fclones: info: Found 3 (12 B) redundant files

    $ cat dupes.txt
    # Report by fclones 0.12.0
    # Timestamp: 2021-06-05 18:21:33.741 +0200
    # Command: fclones group .
    # Found 2 file groups
    # 12 B (12 B) in 3 redundant files can be removed
    7d6ebf613bf94dfd976d169ff6ae02c3, 4 B (4 B) * 2:
        /tmp/test/bar1.txt
        /tmp/test/bar2.txt
    6109f093b3fd5eb1060989c990d1226f, 4 B (4 B) * 3:
        /tmp/test/foo1.txt
        /tmp/test/foo2.txt
        /tmp/test/foo3.txt

Finally we can replace the duplicates with soft links:

    $ fclones link --soft <dupes.txt
    [2021-06-05 18:25:42.488] fclones: info: Started deduplicating
    [2021-06-05 18:25:42.493] fclones: info: Processed 3 files and reclaimed 12 B space

    $ ls -l
    total 12
    -rw-rw-r-- 1 pkolaczk pkolaczk   4 cze  5 18:19 bar1.txt
    lrwxrwxrwx 1 pkolaczk pkolaczk  18 cze  5 18:25 bar2.txt -> /tmp/test/bar1.txt
    -rw-rw-r-- 1 pkolaczk pkolaczk 382 cze  5 18:21 dupes.txt
    -rw-rw-r-- 1 pkolaczk pkolaczk   4 cze  5 18:19 foo1.txt
    lrwxrwxrwx 1 pkolaczk pkolaczk  18 cze  5 18:25 foo2.txt -> /tmp/test/foo1.txt
    lrwxrwxrwx 1 pkolaczk pkolaczk  18 cze  5 18:25 foo3.txt -> /tmp/test/foo1.txt

## Installation

The code has been thoroughly tested on Ubuntu Linux 21.10. Other systems like Windows or Mac OS X and other architectures
may work. Help with testing and/or porting to other platforms is welcome. Please report successes as well as failures.

### Official Packages

[Snap store](https://snapcraft.io/fclones) (Linux):

    snap install fclones

[Homebrew](https://formulae.brew.sh/formula/fclones) (macOS and Linux):

    brew install fclones

Installation packages and binaries for some platforms are also attached directly to
[Releases](https://github.com/pkolaczk/fclones/releases).

### Third-party Packages

* [Arch Linux](https://aur.archlinux.org/packages/fclones/)
* [Alpine Linux](https://pkgs.alpinelinux.org/package/edge/testing/x86_64/fclones)
* [NixOS](https://search.nixos.org/packages?channel=unstable&show=fclones&from=0&size=50&sort=relevance&type=packages&query=fclones)

### Building from Source

[Install Rust Toolchain](https://www.rust-lang.org/tools/install) and then run:

    cargo install fclones

The build will write the binary to `.cargo/bin/fclones`.

## Usage

`fclones` offers separate commands for finding and removing files. This way, you can inspect the list of found files
before applying any modifications to the file system.
- `group` – identifies groups of identical files and prints them to the standard output - `remove` – removes redundant files earlier identified by `group` - `link` – replaces redundant files with links (default: hard links) - `dedupe` – does not remove any files, but deduplicates file data by using native copy-on-write capabilities of the file system (reflink) ### Finding Files Find duplicate, unique, under-replicated or over-replicated files in the current directory, including subdirectories: fclones group . fclones group . --unique fclones group . --rf-under 3 fclones group . --rf-over 3 You can search in multiple directories: fclones group dir1 dir2 dir3 By default, hidden files and files matching patterns listed in `.gitignore` and `.fdignore` are ignored. To search all files, use: fclones group --no-ignore --hidden dir Limit the recursion depth: fclones group . --depth 1 # scan only files in the current dir, skip subdirs fclones group * --depth 0 # similar to the above in shells that expand `*` Caution: Versions up to 0.10 did not descend into directories by default. In those old versions, add the `-R` flag to enable recursive directory walking. Finding files that match across two directory trees, without matching identical files within each tree: fclones group --isolate dir1 dir2 Finding duplicate files of size at least 100 MB: fclones group . -s 100M Filter by file name or path pattern: fclones group . --name '*.jpg' '*.png' Run `fclones` on files selected by `find` (note: this is likely slower than built-in filtering): find . -name '*.c' | fclones group --stdin --depth 0 Follow symbolic links, but don't escape out of the home folder: fclones group . -L --path '/home/**' Exclude a part of the directory tree from the scan: fclones group / --exclude '/dev/**' '/proc/**' ### Removing Files To remove duplicate files, move them to a different place, or replace them with links, you need to send the report produced by `fclones group` to the standard input of the `fclones remove`, `fclones move` or `fclones link` command. The report format is detected automatically. Currently, the `default` and `json` report formats are supported. Assuming the list of duplicates has been saved in the file `dupes.txt`, the following commands would remove the redundant files: fclones link <dupes.txt # replace with hard links fclones link -s <dupes.txt # replace with symbolic links fclones move target_dir <dupes.txt # move to target_dir fclones remove <dupes.txt # remove entirely If you prefer to do everything at once without storing the list of groups in a file, you can pipe: fclones group . | fclones link To select the number of files to preserve, use the `-n`/`--rf-over` option. By default, it is set to the value used when running `group` (which is 1 if it wasn't set explicitly). To leave 2 replicas in each group, run: fclones remove -n 2 <dupes.txt By default, `fclones` follows the order of files specified in the input file. It keeps the files given at the beginning of each list, and removes/replaces the files given at the end of each list. It is possible to change that order with the `--priority` option, for example: fclones remove --priority newest <dupes.txt # remove the newest replicas fclones remove --priority oldest <dupes.txt # remove the oldest replicas For more priority options, see `fclones remove --help`.
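The piping and priority options can be combined into a single invocation. For example, assuming a hypothetical `~/Downloads` directory, the following keeps only the oldest copy in each group and removes the newer ones:

    fclones group ~/Downloads | fclones remove -n 1 --priority newest

Because `--priority newest` removes the newest replicas first, the single replica that survives with `-n 1` is the oldest one.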
It is also possible to restrict removal to files with names or paths matching a pattern: fclones remove --name '*.jpg' <dupes.txt # remove only jpg files fclones remove --path '/trash/**' <dupes.txt # remove only files in the /trash folder If it is easier to specify a pattern for files which you do *not* want to remove, then use one of the `keep` options: fclones remove --keep-name '*.mov' <dupes.txt # never remove mov files fclones remove --keep-path '/important/**' <dupes.txt # never remove files in the /important folder To make sure you don't accidentally remove the wrong files, use the `--dry-run` option. This option prints all the commands that would be executed, but it doesn't actually execute them: fclones link --soft <dupes.txt --dry-run 2>/dev/null mv /tmp/test/bar2.txt /tmp/test/bar2.txt.jkXswbsDxhqItPeOfCXsWN4d ln -s /tmp/test/bar1.txt /tmp/test/bar2.txt rm /tmp/test/bar2.txt.jkXswbsDxhqItPeOfCXsWN4d mv /tmp/test/foo2.txt /tmp/test/foo2.txt.ze1hvhNjfre618TkRGUxJNzx ln -s /tmp/test/foo1.txt /tmp/test/foo2.txt rm /tmp/test/foo2.txt.ze1hvhNjfre618TkRGUxJNzx mv /tmp/test/foo3.txt /tmp/test/foo3.txt.ttLAWO6YckczL1LXEsHfcEau ln -s /tmp/test/foo1.txt /tmp/test/foo3.txt rm /tmp/test/foo3.txt.ttLAWO6YckczL1LXEsHfcEau ### Handling links Files linked by symbolic links or hard links are not treated as duplicates. You can change this behavior by setting the following flags: * When `--isolate` is set: * links residing in different directory trees are treated as duplicates, * links residing in the same directory tree are counted as a single replica. * When `--match-links` is set, fclones treats all linked files as duplicates. Consider the following directory structure, where all files are hard links sharing the same content: dir1: - file1 - file2 dir2: - file3 - file4 Because all files are essentially the same data, they will end up in the same file group, but the actual number of replicas present in that file group will differ depending on the flags given: | Command | Number of replicas | Group reported | Files to remove | |-----------------------------------------|--------------------|------------------|---------------------| | `fclones group dir1 dir2` | 1 | No | | | `fclones group dir1 dir2 --isolate` | 2 | Yes | file3, file4 | | `fclones group dir1 dir2 --match-links` | 4 | Yes | file2, file3, file4 | #### Symbolic links The `group` command ignores symbolic links to files unless at least one of the `--follow-links` or `--symbolic-links` flags is set. If only `--follow-links` is set, symbolic links to files are followed and resolved to their targets. If `--symbolic-links` is set, symbolic links to files are not followed, but treated as hard links and potentially included in the report. When both `--symbolic-links` and `--follow-links` are set, symbolic links to directories are followed, but symbolic links to files are treated as hard links. **Caution**: Using `--match-links` together with `--symbolic-links` is very dangerous. It is easy to end up deleting the only regular file you have, and to be left with a bunch of orphan symbolic links. ### Preprocessing Files Use the `--transform` option to safely transform files with an external command. By default, the transformation happens on a copy of the file data, to avoid accidental data loss. Note that this option may significantly slow down processing of a huge number of files, because it invokes the external program for each file. The following command will strip EXIF data before matching duplicate jpg images: fclones group .
--name '*.jpg' -i --transform 'exiv2 -d a $IN' --in-place ### Other List more options: fclones [command] -h # short help fclones [command] --help # detailed help ### Path Globbing `fclones` understands a subset of Bash Extended Globbing. The following wildcards can be used: - `?` matches any character except the directory separator - `[a-z]` matches one of the characters or character ranges given in the square brackets - `[!a-z]` matches any character that is not given in the square brackets - `*` matches any sequence of characters except the directory separator - `**` matches any sequence of characters including the directory separator - `{a,b}` matches exactly one pattern from the comma-separated patterns given inside the curly brackets - `@(a|b)` same as `{a,b}` - `?(a|b)` matches at most one occurrence of the pattern inside the brackets - `+(a|b)` matches at least one occurrence of the patterns given inside the brackets - `*(a|b)` matches any number of occurrences of the patterns given inside the brackets - `\` escapes wildcards on Unix-like systems, e.g. `\?` would match `?` literally - `^` escapes wildcards on Windows, e.g. `^?` would match `?` literally #### Caution * On Unix-like systems, when using globs, one must be very careful to avoid accidental expansion of globs by the shell. In many cases having globs expanded by the shell instead of by `fclones` is not what you want. In such cases, you need to quote the globs: fclones group . --name '*.jpg' * On Windows, the default shell doesn't remove quotes before passing the arguments to the program, therefore you need to pass globs unquoted: fclones group . --name *.jpg * On Windows, the default shell doesn't support path globbing, therefore wildcard characters such as * and ? used in paths will be passed literally, and they are likely to create invalid paths. For example, the following command, which searches for duplicate files in the current directory in Bash, will likely fail in the default Windows shell: fclones group * If you need path globbing, and your shell does not support it, use the built-in path globbing provided by `--name` or `--path`. ## The Algorithm Files are processed in several stages. Each stage except the last one is parallel, but the previous stage must complete fully before the next one is started. 1. Scan input files and filter files matching the selection criteria. Walk directories recursively if requested. Follow symbolic links if requested. For files that match the selection criteria, read their size. 2. Group collected files by size by storing them in a hash-map. Remove groups smaller than the desired lower-bound (default 2). 3. In each group, remove duplicate files with the same inode id. The same file could be reached through different paths when hardlinks are present. This step can be optionally skipped. 4. For each remaining file, compute a hash of a tiny block of initial data. Put files with different hashes into separate groups. Prune result groups if needed. 5. For each remaining file, compute a hash of a tiny block of data at the end of the file. Put files with different hashes into separate groups. Prune small groups if needed. 6. For each remaining file, compute a hash of the whole contents of the file. Note that for small files we might have already computed a full contents hash in step 4, therefore these files can safely skip this step. Same as in steps 4 and 5, split groups and remove the ones that are too small. 7. Write the report to the standard output.
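To make the staged pruning concrete, here is a minimal, self-contained Rust sketch of the same idea. This is not fclones' actual implementation: it uses a single 64-bit `DefaultHasher` instead of 128-bit hashes, reads whole files into memory, omits the suffix stage, hard-link handling and all parallelism, and the 4 KiB prefix size is just an example value.

```
use std::collections::HashMap;
use std::fs::{self, File};
use std::io::{self, Read};
use std::path::PathBuf;

/// Stand-in for a real 128-bit hash such as metrohash; 64 bits suffice for a sketch.
fn hash_bytes(data: &[u8]) -> u64 {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};
    let mut h = DefaultHasher::new();
    data.hash(&mut h);
    h.finish()
}

/// Hashes at most `max_len` initial bytes of a file (the "prefix" stage).
fn hash_prefix(path: &PathBuf, max_len: usize) -> io::Result<u64> {
    let mut buf = vec![0u8; max_len];
    let n = File::open(path)?.read(&mut buf)?;
    Ok(hash_bytes(&buf[..n]))
}

/// Hashes the whole contents of a file (the final stage).
fn hash_full(path: &PathBuf) -> io::Result<u64> {
    let mut buf = Vec::new();
    File::open(path)?.read_to_end(&mut buf)?;
    Ok(hash_bytes(&buf))
}

/// Splits each group into subgroups by the given key and drops groups
/// that became too small to contain any duplicates.
fn regroup<K: std::hash::Hash + Eq>(
    groups: Vec<Vec<PathBuf>>,
    key: impl Fn(&PathBuf) -> io::Result<K>,
) -> Vec<Vec<PathBuf>> {
    let mut result = Vec::new();
    for group in groups {
        let mut map: HashMap<K, Vec<PathBuf>> = HashMap::new();
        for path in group {
            if let Ok(k) = key(&path) {
                map.entry(k).or_default().push(path);
            }
        }
        result.extend(map.into_values().filter(|g| g.len() >= 2));
    }
    result
}

fn find_duplicates(paths: Vec<PathBuf>) -> Vec<Vec<PathBuf>> {
    // Stage: group by file size (metadata only, no file contents read).
    let groups = regroup(vec![paths], |p| fs::metadata(p).map(|m| m.len()));
    // Stage: group by a hash of a small prefix, so most non-duplicates are
    // rejected after reading only a few KiB of each candidate.
    let groups = regroup(groups, |p| hash_prefix(p, 4096));
    // Stage: group by a hash of the full contents.
    regroup(groups, hash_full)
}

fn main() {
    let paths: Vec<PathBuf> = std::env::args().skip(1).map(PathBuf::from).collect();
    for group in find_duplicates(paths) {
        println!("{group:?}");
    }
}
```

Each stage only ever reads the files that survived the previous one, which is why cheap checks (size, prefix) come first and the expensive full-content hash comes last.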
Note that there is no byte-by-byte comparison of files anywhere. All available hash functions are at least 128-bit wide, and you don't need to worry about hash collisions. At 10<sup>15</sup> files, the probability of collision is about 0.000000001 when using a 128-bit hash (by the birthday approximation, the collision probability for n files hashed to 128 bits is roughly n<sup>2</sup>/2<sup>129</sup>, which for n = 10<sup>15</sup> gives about 1.5 × 10<sup>-9</sup>), without taking into account the requirement for the files to also match by size. ### Hashes You can select the hash function with `--hash-fn` (default: `metro`). Non-cryptographic hashes are much more efficient than cryptographic ones; however, you probably won't see much difference unless you're reading from a fast SSD or if file data is cached. | Hash function | Hash width | Cryptographic | |-------------------------------------------------------------|------------|---------------| | [metro](http://www.jandrewrogers.com/2015/05/27/metrohash/) | 128-bit | No | | [xxhash3](https://cyan4973.github.io/xxHash/) | 128-bit | No | | [blake3](https://github.com/BLAKE3-team/BLAKE3) | 256-bit | Yes | | [sha256](https://en.wikipedia.org/wiki/SHA-2) | 256-bit | Yes | | [sha512](https://en.wikipedia.org/wiki/SHA-2) | 512-bit | Yes | | [sha3-256](https://en.wikipedia.org/wiki/SHA-3) | 256-bit | Yes | | [sha3-512](https://en.wikipedia.org/wiki/SHA-3) | 512-bit | Yes | ## Tuning This section provides hints on getting the best performance from `fclones`. ### Incremental Mode If you expect to run `fclones group` more than once on the same set of files, you might benefit from turning on the hash cache by adding the `--cache` flag: ``` fclones group --cache <dir> ``` Caching can dramatically improve grouping speed on subsequent runs of `fclones` at the expense of some additional storage space needed for the cache. Caching also allows for resuming work quickly after interruption, so it is recommended if you plan to run `fclones` on huge data sets. The cache works as follows: - Each newly computed file hash is persisted in the cache together with some metadata of the file such as its modification timestamp and length. - Whenever a file hash needs to be computed, it is first looked up in the cache. The cached hash is used if the current metadata of the file strictly matches the metadata stored in the cache. Cached hashes are not invalidated by file moves because files are identified by their internal identifiers (inode identifiers on Unix), not by path names, and moves/renames typically preserve those. Beware that caching relies on file metadata to detect changes in file contents. This might introduce some inaccuracies to the grouping process if a file's modification timestamp and length are not updated immediately whenever the file gets modified. Most file systems update the timestamps automatically on closing the file. Therefore, changed files that are held open for a long time (e.g. by database systems) might not be noticed by `fclones group` and might use stale cached values. The cache database is located in the standard cache directory of the user account. Typically, those are: * Linux: `$HOME/.cache/fclones` * macOS: `$HOME/Library/Caches/fclones` * Windows: `$HOME/AppData/Local/fclones` ### Configuring Parallelism The `--threads` parameter controls the sizes of the internal thread-pool(s). This can be used to reduce the parallelism level when you don't want `fclones` to impact the performance of your system too much, e.g. when you need to do some other work at the same time. We also recommend reducing the parallelism level if you need to reduce memory usage.
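For example, to lower the parallelism level without spelling out individual pools (equivalent to `default:2,2`; the full `--threads` syntax is described below):

    fclones group <paths> --threads 2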
When using `fclones` up to version 0.6.x to deduplicate files of at least a few MB each on spinning drives (HDD), it is recommended to set `--threads 1`, because accessing big files from multiple threads on an HDD can be much slower than single-threaded access (your mileage may vary; this is heavily OS-dependent, and 2x-10x performance differences have been reported). Since version 0.7.0, fclones uses separate per-device thread-pools for final hashing and automatically tunes the level of parallelism, memory buffer sizes and partial hashing sizes based on the device type. These automatic settings can be overridden with `--threads` as well. The following options can be passed to `--threads`. The more specific options override the less specific ones. - `main:<n>` – sets the size of the main thread-pool used for random I/O: directory tree scanning, file metadata fetching and in-memory sorting/hashing. These operations typically benefit from a high parallelism level, even on spinning drives. Unset by default, which means the pool will be configured to use all available CPU cores. - `dev:<device>:<r>,<s>` – sets the sizes of the thread-pools: `r` for random I/O and `s` for sequential I/O on the block device with the given name. The name of the device is OS-dependent. Note this is not the same as the partition name or mount point. - `ssd:<r>,<s>` – sets the sizes of the thread-pools used for I/O on solid-state drives. Unset by default. - `hdd:<r>,<s>` – sets the sizes of the thread-pools used for I/O on spinning drives. Defaults to `8,1`. - `removable:<r>,<s>` – sets the sizes of the thread-pools used for I/O on removable devices (e.g. USB sticks). Defaults to `4,1`. - `unknown:<r>,<s>` – sets the sizes of the thread-pools used for I/O on devices of unknown type. Sometimes the device type can't be determined, e.g. if it is mounted as NAS. Defaults to `4,1`. - `default:<r>,<s>` – sets the pool sizes to be used by all unset options - `<r>,<s>` – same as `default:<r>,<s>` - `<n>` – same as `default:<n>,<n>` ### Examples To limit the parallelism level for the main thread pool to 1: fclones group <paths> --threads main:1 To limit the parallelism level for all I/O access for all SSD devices: fclones group <paths> --threads ssd:1 To set the parallelism level to the number of cores for random I/O access and to 2 for sequential I/O access for the `/dev/sda` block device: fclones group <paths> --threads dev:/dev/sda:0,2 Multiple `--threads` options can be given, separated by spaces: fclones group <paths> --threads main:16 ssd:4 hdd:1,1 ## Benchmarks Different duplicate finders were given the task of finding duplicates in a large set of files. Before each run, the system page cache was evicted with `echo 3 > /proc/sys/vm/drop_caches`.
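For reference, a single measurement could be reproduced roughly as follows (a sketch assuming root privileges to drop the page cache; the exact command lines used for each program in the HDD benchmark are listed below):

    sync && echo 3 | sudo tee /proc/sys/vm/drop_caches
    /usr/bin/time -v fclones group <file set root> >/dev/null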
### SSD Benchmark - Model: Dell Precision 5520 - CPU: Intel(R) Xeon(R) CPU E3-1505M v6 @ 3.00GHz - RAM: 32 GB - Storage: local NVMe SSD 512 GB - System: Ubuntu Linux 20.10, kernel 5.8.0-53-generic - Task: 1,460,720 paths, 316 GB of data Program | Version | Language | Time | Peak Memory -------------------------------------------------------|-----------|-----------|------------------:|-------------- fclones | 0.12.1 | Rust | 0:34.59 | 266 MB [yadf](https://github.com/jRimbault/yadf) | 0.15.2 | Rust | 0:59.32 | 329 MB [czkawka](https://qarmin.github.io/czkawka/) | 3.1.0 | Rust | 2:09.00 | 1.4 GB [rmlint](https://github.com/sahib/rmlint) | 2.9.0 | C, Python | 2:28.43 | 942 MB [jdupes](https://github.com/jbruchon/jdupes) | 1.18.2 | C | 5:01.91 | 332 MB [dupe-krill](https://github.com/kornelski/dupe-krill) | 1.4.5 | Rust | 5:09.52 | 706 MB [fdupes](https://github.com/adrianlopezroche/fdupes) | 2.1.1 | C | 5:46.19 | 342 MB [rdfind](https://github.com/pauldreik/rdfind) | 1.4.1 | C++ | 5:53.07 | 496 MB [dupeguru](https://dupeguru.voltaicideas.net/) | 4.1.1 | Python | 7:49.89 | 1.4 GB [fdupes-java](https://github.com/cbismuth/fdupes-java) | 1.3.1 | Java | > 20 minutes | 4.2 GB `fdupes-java` did not finish the test. I interrupted it after 20 minutes while it was still computing MD5 in stage 2/3. Unfortunately `fdupes-java` doesn't display a useful progress bar, so it is not possible to estimate how long it would take. ### HDD Benchmark - Model: Dell Precision M4600 - CPU: Intel(R) Core(TM) i7-2760QM CPU @ 2.40GHz - RAM: 24 GB - System: Mint Linux 19.3, kernel 5.4.0-70-generic - Storage: Seagate Momentus 7200 RPM SATA drive, EXT4 filesystem - Task: 51370 paths, 2 GB data, 6811 (471 MB) duplicate files Commands used: /usr/bin/time -v fclones -R <file set root> /usr/bin/time -v jdupes -R -Q <file set root> /usr/bin/time -v fdupes -R <file set root> /usr/bin/time -v rdfind <file set root> In this benchmark, the page cache was dropped before each run. Program | Version | Language | Threads | Time | Peak Memory -------------------------------------------------------|-----------|----------|--------:|----------------:|-------------: fclones | 0.9.1 | Rust | 1 | 0:19.45 | 18.1 MB [rdfind](https://github.com/pauldreik/rdfind) | 1.3.5 | C++ | 1 | 0:33.70 | 18.5 MB [yadf](https://github.com/jRimbault/yadf) | 0.14.1 | Rust | | 1:11.69 | 22.9 MB [jdupes](https://github.com/jbruchon/jdupes) | 1.9 | C | 1 | 1:18.47 | 15.7 MB [fdupes](https://github.com/adrianlopezroche/fdupes) | 1.6.1 | C | 1 | 1:33.71 | 15.9 MB 0707010000000A000041ED000000000000000000000002653E86C200000000000000000000000000000000000000000000001B00000000fclones-0.34.0/fclones/src0707010000000B000081A4000000000000000000000001653E86C2000038E2000000000000000000000000000000000000002200000000fclones-0.34.0/fclones/src/arg.rs//! Command line argument parsing and quoting utilities. //! //! Provides lossless OsString conversions to and from String by shell-like escaping and quoting. 
use std::error::Error; use std::ffi::{OsStr, OsString}; use std::fmt::{Debug, Display, Formatter}; use std::mem; use itertools::Itertools; use serde::de::Visitor; use serde::{Deserialize, Deserializer, Serialize, Serializer}; use stfu8::DecodeError; /// Argument passed to the app #[derive(Debug, Eq, PartialEq, Clone)] pub struct Arg(OsString); impl Arg { pub fn from_escaped_string(s: &str) -> Result<Self, DecodeError> { Ok(Arg(from_stfu8(s)?)) } pub fn to_escaped_string(&self) -> String { to_stfu8(self.0.clone()) } pub fn quote(&self) -> String { quote(self.0.to_os_string()) } pub fn as_os_str(&self) -> &OsStr { self.0.as_ref() } } impl AsRef<OsStr> for Arg { fn as_ref(&self) -> &OsStr { self.0.as_os_str() } } impl From<OsString> for Arg { fn from(s: OsString) -> Self { Arg(s) } } impl From<&OsStr> for Arg { fn from(s: &OsStr) -> Self { Arg(OsString::from(s)) } } impl From<&str> for Arg { fn from(s: &str) -> Self { Arg(OsString::from(s)) } } struct ArgVisitor; impl Visitor<'_> for ArgVisitor { type Value = Arg; fn expecting(&self, formatter: &mut Formatter<'_>) -> std::fmt::Result { formatter.write_str("an STFU encoded string") } fn visit_str<E>(self, v: &str) -> Result<Self::Value, E> where E: serde::de::Error, { let arg = Arg::from_escaped_string(v).map_err(|e| E::custom(e.to_string()))?; Ok(arg) } } impl Serialize for Arg { fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> where S: Serializer, { serializer.serialize_str(self.to_escaped_string().as_str()) } } impl<'de> Deserialize<'de> for Arg { fn deserialize<D>(deserializer: D) -> Result<Arg, D::Error> where D: Deserializer<'de>, { deserializer.deserialize_str(ArgVisitor) } } /// Returns a lossless string representation in [STFU8 format](https://crates.io/crates/stfu8). #[cfg(unix)] pub fn to_stfu8(s: OsString) -> String { use std::os::unix::ffi::OsStringExt; let raw_path_bytes = s.into_vec(); stfu8::encode_u8(&raw_path_bytes) } /// Returns a lossless string representation in [STFU8 format](https://crates.io/crates/stfu8). #[cfg(windows)] pub fn to_stfu8(s: OsString) -> String { use std::os::windows::ffi::OsStrExt; let raw_path_bytes: Vec<u16> = s.encode_wide().collect(); stfu8::encode_u16(&raw_path_bytes) } /// Decodes the path from the string encoded with [`to_stfu8`](OsString::to_stfu8). #[cfg(unix)] pub fn from_stfu8(encoded: &str) -> Result<OsString, DecodeError> { use std::os::unix::ffi::OsStringExt; let raw_bytes = stfu8::decode_u8(encoded)?; Ok(OsString::from_vec(raw_bytes)) } /// Decodes the path from the string encoded with [`to_stfu8`](OsString::to_stfu8). #[cfg(windows)] pub fn from_stfu8(encoded: &str) -> Result<OsString, DecodeError> { use std::os::windows::ffi::OsStringExt; let raw_bytes = stfu8::decode_u16(encoded)?; Ok(OsString::from_wide(&raw_bytes)) } const SPECIAL_CHARS: [char; 25] = [ '|', '&', ';', '<', '>', '(', ')', '{', '}', '$', '`', '\\', '\'', '"', ' ', '\t', '*', '?', '+', '[', ']', '#', 'Ëœ', '=', '%', ]; /// Escapes special characters in a string, so that it will retain its literal meaning when used as /// a part of command in Unix shell. /// /// It tries to avoid introducing any unnecessary quotes or escape characters, but specifics /// regarding quoting style are left unspecified. 
pub fn quote(s: OsString) -> String { let lossy = s.to_string_lossy(); if lossy .chars() .any(|c| c < '\u{20}' || c == '\u{7f}' || c == '\u{fffd}' || c == '\'') { format!("$'{}'", to_stfu8(s).replace('\'', "\\'")) } else if lossy.chars().any(|c| SPECIAL_CHARS.contains(&c)) { format!("'{lossy}'") } else { lossy.to_string() } } #[derive(Debug)] pub struct ParseError { pub msg: String, } impl ParseError { pub fn new(msg: &str) -> ParseError { ParseError { msg: msg.to_string(), } } } impl Display for ParseError { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!(f, "{}", self.msg) } } impl Error for ParseError {} enum State { /// Within a delimiter. Delimiter, /// After backslash, but before starting word. Backslash, /// Within an unquoted word. Unquoted, /// After backslash in an unquoted word. UnquotedBackslash, /// Within a single quoted word. SingleQuoted, /// Within a double quoted word. DoubleQuoted, /// After backslash inside a double quoted word. DoubleQuotedBackslash, /// After dollar in an unquoted word. Dollar, /// Within a quoted word preceded by a dollar sign. DollarQuoted, /// After backslash in a dollar-quoted word. DollarQuotedBackslash, /// Inside a comment. Comment, } /// Appends a character to OsString fn append(s: &mut OsString, c: char) { let mut buf = [0; 4]; let c = c.encode_utf8(&mut buf); s.push(c) } /// Splits command line into separate arguments, in much the same way Unix shell would, but without /// many of expansion the shell would perform. /// /// The split functionality is compatible with behaviour of Unix shell, but with word expansions /// limited to quote removal, and without special token recognition rules for operators. /// /// The result is exactly the same as one obtained from Unix shell as long as those unsupported /// features are not present in input: no operators, no variable assignments, no tilde expansion, /// no parameter expansion, no command substitution, no arithmetic expansion, no pathname /// expansion. /// /// In case those unsupported shell features are present, the syntax that introduce them is /// interpreted literally. /// /// # Errors /// /// When input contains unmatched quote, an error is returned. /// /// # Compatibility with other implementations /// /// It should be fully compatible with g_shell_parse_argv from GLib, except that in GLib /// it is an error not to have any words after tokenization. /// /// It is also very close to shlex.split available in Python standard library, when used in POSIX /// mode with support for comments. Though, shlex implementation diverges from POSIX, and from /// implementation contained herein in three aspects. First, it doesn't support line continuations. /// Second, inside double quotes, the backslash characters retains its special meaning as an escape /// character only when followed by \\ or \", whereas POSIX specifies that it should retain its /// special meaning when followed by: $, \`, \", \\, or a newline. Third, it treats carriage return /// as one of delimiters. 
/// ``` pub fn split(s: &str) -> Result<Vec<Arg>, ParseError> { // Based on shell-words crate by Tomasz MiÄ…sko // Handling of dollar quotes added by Piotr KoÅ‚aczkowski use State::*; let mut words = Vec::new(); let mut word = OsString::new(); let mut pos = 0; let mut dollar_quote_start = 0; let mut chars = s.chars(); let mut state = Delimiter; loop { let c = chars.next(); state = match state { Delimiter => match c { None => break, Some('\'') => SingleQuoted, Some('\"') => DoubleQuoted, Some('\\') => Backslash, Some('\t') | Some(' ') | Some('\n') => Delimiter, Some('$') => Dollar, Some('#') => Comment, Some(c) => { append(&mut word, c); Unquoted } }, Backslash => match c { None => { append(&mut word, '\\'); words.push(Arg(mem::replace(&mut word, OsString::new()))); break; } Some('\n') => Delimiter, Some(c) => { append(&mut word, c); Unquoted } }, Unquoted => match c { None => { words.push(Arg(mem::replace(&mut word, OsString::new()))); break; } Some('\'') => SingleQuoted, Some('\"') => DoubleQuoted, Some('\\') => UnquotedBackslash, Some('$') => Dollar, Some('\t') | Some(' ') | Some('\n') => { words.push(Arg(mem::replace(&mut word, OsString::new()))); Delimiter } Some(c) => { append(&mut word, c); Unquoted } }, UnquotedBackslash => match c { None => { append(&mut word, '\\'); words.push(Arg(mem::replace(&mut word, OsString::new()))); break; } Some('\n') => Unquoted, Some(c) => { append(&mut word, c); Unquoted } }, SingleQuoted => match c { None => return Err(ParseError::new("Unclosed single quote")), Some('\'') => Unquoted, Some(c) => { append(&mut word, c); SingleQuoted } }, DoubleQuoted => match c { None => return Err(ParseError::new("Unclosed double quote")), Some('\"') => Unquoted, Some('\\') => DoubleQuotedBackslash, Some(c) => { append(&mut word, c); DoubleQuoted } }, DoubleQuotedBackslash => match c { None => return Err(ParseError::new("Unexpected end of input")), Some('\n') => DoubleQuoted, Some(c @ '$') | Some(c @ '`') | Some(c @ '"') | Some(c @ '\\') => { append(&mut word, c); DoubleQuoted } Some(c) => { append(&mut word, '\\'); append(&mut word, c); DoubleQuoted } }, Dollar => match c { None => return Err(ParseError::new("Unexpected end of input")), Some('\'') => { dollar_quote_start = pos + 1; DollarQuoted } Some(_) => return Err(ParseError::new("Expected single quote")), }, DollarQuoted => match c { None => return Err(ParseError::new("Unclosed single quote")), Some('\\') => DollarQuotedBackslash, Some('\'') => { let quoted_slice = &s[dollar_quote_start..pos].replace("\\'", "'"); let decoded = from_stfu8(quoted_slice).map_err(|e| { ParseError::new(format!("Failed to decode STFU-8 chunk: {e}").as_str()) })?; word.push(decoded.as_os_str()); Unquoted } Some(_) => DollarQuoted, }, DollarQuotedBackslash => match c { None => return Err(ParseError::new("Unexpected end of input")), Some(_) => DollarQuoted, }, Comment => match c { None => break, Some('\n') => Delimiter, Some(_) => Comment, }, }; pos += 1; } Ok(words) } /// Joins multiple command line args into a single-line escaped representation pub fn join(args: &[Arg]) -> String { args.iter().map(|arg| arg.quote()).join(" ") } #[cfg(test)] mod test { use std::ffi::OsString; use crate::arg::{quote, split, Arg}; #[test] fn quote_no_special_chars() { assert_eq!(quote(OsString::from("abc/def_123.txt")), "abc/def_123.txt"); } #[test] fn quote_path_with_control_chars() { assert_eq!(quote(OsString::from("a\nb")), "$'a\\nb'"); assert_eq!(quote(OsString::from("a\tb")), "$'a\\tb'"); } #[test] fn quote_path_with_special_chars() { 
assert_eq!(quote(OsString::from("a b")), "'a b'"); assert_eq!(quote(OsString::from("a*b")), "'a*b'"); assert_eq!(quote(OsString::from("a?b")), "'a?b'"); assert_eq!(quote(OsString::from("$ab")), "'$ab'"); assert_eq!(quote(OsString::from("a(b)")), "'a(b)'"); assert_eq!(quote(OsString::from("a\\b")), "'a\\b'"); } #[test] fn quote_path_with_single_quotes() { assert_eq!(quote(OsString::from("a'b")), "$'a\\'b'"); assert_eq!(quote(OsString::from("a'b'")), "$'a\\'b\\''"); } #[test] fn split_unquoted_args() { assert_eq!( split("arg1 arg2").unwrap(), vec![Arg::from("arg1"), Arg::from("arg2")] ) } #[test] fn split_single_quoted_args() { assert_eq!( split("'arg1 with spaces' arg2").unwrap(), vec![Arg::from("arg1 with spaces"), Arg::from("arg2")] ) } #[test] fn split_doubly_quoted_args() { assert_eq!( split("\"arg1 with spaces\" arg2").unwrap(), vec![Arg::from("arg1 with spaces"), Arg::from("arg2")] ) } #[test] fn split_quotes_escaping() { assert_eq!( split("\"escaped \\\" quotes\"").unwrap(), vec![Arg::from("escaped \" quotes")] ) } #[test] fn split_escaped_single_quote() { assert_eq!( split("$'single\\'quote'").unwrap(), vec![Arg::from("single'quote")] ); } #[test] fn split_spaces_escaping() { assert_eq!( split("escaped\\ space").unwrap(), vec![Arg::from("escaped space")] ) } #[test] fn dollar_quoting() { assert_eq!( split("arg1 $'arg2-\\n\\t\\\\' arg3-$'\\x7f'").unwrap(), vec![ Arg::from("arg1"), Arg::from("arg2-\n\t\\"), Arg::from("arg3-\x7f") ] ) } } 0707010000000C000081A4000000000000000000000001653E86C200002EEA000000000000000000000000000000000000002400000000fclones-0.34.0/fclones/src/cache.rs//! Persistent caching of file hashes use crossbeam_channel::RecvTimeoutError; use std::fmt::{Display, Formatter}; use std::fs::create_dir_all; use std::sync::Arc; use std::thread; use std::thread::JoinHandle; use std::time::{Duration, UNIX_EPOCH}; use serde::{Deserialize, Serialize}; use crate::error::Error; use crate::file::{FileChunk, FileHash, FileId, FileLen, FileMetadata, FilePos}; use crate::hasher::HashFn; use crate::path::Path; #[derive(Debug, Serialize, Deserialize)] pub struct Key { file_id: FileId, chunk_pos: FilePos, chunk_len: FileLen, } impl Display for Key { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!(f, "{}:{}", self.file_id.device, self.file_id.inode) } } #[derive(Debug, Serialize, Deserialize)] struct CachedFileInfo { modified_timestamp_ms: u64, file_len: FileLen, data_len: FileLen, hash: FileHash, } type InnerCache = typed_sled::Tree<Key, CachedFileInfo>; const FLUSH_INTERVAL: Duration = Duration::from_millis(1000); /// Caches file hashes to avoid repeated computations in subsequent runs of fclones. /// /// Most files don't change very frequently so their hashes don't change. /// Usually it is a lot faster to retrieve the hash from an embedded database that to compute /// them from file data. pub struct HashCache { cache: Arc<InnerCache>, flusher: HashCacheFlusher, } impl HashCache { /// Opens the file hash database located in the given directory. /// If the database doesn't exist yet, creates a new one. 
pub fn open( database_path: &Path, transform: Option<&str>, algorithm: HashFn, ) -> Result<HashCache, Error> { create_dir_all(database_path.to_path_buf()).map_err(|e| { format!( "Count not create hash database directory {}: {}", database_path.to_escaped_string(), e ) })?; let db = sled::open(database_path.to_path_buf()).map_err(|e| { format!( "Failed to open hash database at {}: {}", database_path.to_escaped_string(), e ) })?; let tree_id = format!("hash_db:{:?}:{}", algorithm, transform.unwrap_or("<none>")); let cache = Arc::new(typed_sled::Tree::open(&db, tree_id)); let flusher = HashCacheFlusher::start(&cache); Ok(HashCache { cache, flusher }) } /// Opens the file hash database located in `fclones` subdir of user cache directory. /// If the database doesn't exist yet, creates a new one. pub fn open_default(transform: Option<&str>, algorithm: HashFn) -> Result<HashCache, Error> { let cache_dir = dirs::cache_dir().ok_or("Could not obtain user cache directory from the system.")?; let hash_db_path = cache_dir.join("fclones"); Self::open(&Path::from(hash_db_path), transform, algorithm) } /// Stores the file hash plus some file metadata in the cache. pub fn put( &self, key: &Key, file: &FileMetadata, data_len: FileLen, hash: FileHash, ) -> Result<(), Error> { let value = CachedFileInfo { modified_timestamp_ms: file .modified() .map_err(|e| format!("Unable to get file modification timestamp: {e}"))? .duration_since(UNIX_EPOCH) .unwrap_or(Duration::ZERO) .as_millis() as u64, file_len: file.len(), data_len, hash, }; self.cache .insert(key, &value) .map_err(|e| format!("Failed to write entry to cache: {e}"))?; // Check for cache flush errors. If there were errors, report them to the caller. match self.flusher.err_channel.try_recv() { Ok(err) => Err(err), Err(_) => Ok(()), } } /// Retrieves the cached hash of a file. /// /// Returns `Ok(None)` if file is not present in the cache or if its current length /// or its current modification time do not match the file length and modification time /// recorded at insertion time. pub fn get( &self, key: &Key, metadata: &FileMetadata, ) -> Result<Option<(FileLen, FileHash)>, Error> { let value = self .cache .get(key) .map_err(|e| format!("Failed to retrieve entry from cache: {e}"))?; let value = match value { Some(v) => v, None => return Ok(None), // not found in cache }; let modified = metadata .modified() .map_err(|e| format!("Unable to get file modification timestamp: {e}"))? .duration_since(UNIX_EPOCH) .unwrap_or(Duration::ZERO) .as_millis() as u64; if value.modified_timestamp_ms != modified || value.file_len != metadata.len() { Ok(None) // found in cache, but the file has changed since it was cached } else { Ok(Some((value.data_len, value.hash))) } } /// Returns the cache key for a file. /// /// Using file identifiers as cache keys instead of paths allows the user for moving or renaming /// files without losing their cached hash data. pub fn key(&self, chunk: &FileChunk<'_>, metadata: &FileMetadata) -> Result<Key, Error> { let key = Key { file_id: metadata.file_id(), chunk_pos: chunk.pos, chunk_len: chunk.len, }; Ok(key) } /// Flushes all unwritten data and closes the cache. 
pub fn close(self) -> Result<(), Error> { self.cache .flush() .map_err(|e| format!("Failed to flush cache: {e}"))?; Ok(()) } } /// Periodically flushes the cache in a background thread struct HashCacheFlusher { thread_handle: Option<JoinHandle<()>>, control_channel: Option<crossbeam_channel::Sender<()>>, err_channel: crossbeam_channel::Receiver<Error>, } impl HashCacheFlusher { fn start(cache: &Arc<InnerCache>) -> HashCacheFlusher { let cache = Arc::downgrade(cache); let (ctrl_tx, ctrl_rx) = crossbeam_channel::bounded::<()>(1); let (err_tx, err_rx) = crossbeam_channel::bounded(1); let thread_handle = thread::spawn(move || { while let Err(RecvTimeoutError::Timeout) = ctrl_rx.recv_timeout(FLUSH_INTERVAL) { if let Some(cache) = cache.upgrade() { if let Err(e) = cache.flush() { err_tx .send(format!("Failed to flush the hash cache: {e}").into()) .unwrap_or_default(); return; } } } }); HashCacheFlusher { thread_handle: Some(thread_handle), control_channel: Some(ctrl_tx), err_channel: err_rx, } } } impl Drop for HashCacheFlusher { fn drop(&mut self) { // Signal the flusher thread to exit: drop(self.control_channel.take()); // Wait for the flusher thread to exit: self.thread_handle.take().unwrap().join().unwrap(); } } #[cfg(test)] mod test { use std::fs::OpenOptions; use crate::cache::HashCache; use crate::file::{FileChunk, FileHash, FileLen, FileMetadata, FilePos}; use crate::hasher::HashFn; use crate::path::Path; use crate::util::test::{create_file, with_dir}; #[test] fn return_cached_hash_if_file_hasnt_changed() { with_dir("cache/return_cached_hash_if_file_hasnt_changed", |root| { let path = root.join("file"); create_file(&path); let path = Path::from(&path); let metadata = FileMetadata::new(&path).unwrap(); let chunk = FileChunk::new(&path, FilePos(0), FileLen(1000)); let cache_path = Path::from(root.join("cache")); let cache = HashCache::open(&cache_path, None, HashFn::Metro).unwrap(); let key = cache.key(&chunk, &metadata).unwrap(); let orig_hash = FileHash::from(12345); let data_len = FileLen(200); cache .put(&key, &metadata, data_len, orig_hash.clone()) .unwrap(); let cached_hash = cache.get(&key, &metadata).unwrap(); assert_eq!(cached_hash, Some((data_len, orig_hash))) }); } #[test] fn return_none_if_file_has_changed() { with_dir("cache/return_none_if_file_has_changed", |root| { let path = root.join("file"); create_file(&path); let path = Path::from(&path); let metadata = FileMetadata::new(&path).unwrap(); let chunk = FileChunk::new(&path, FilePos(0), FileLen(1000)); let cache_path = Path::from(root.join("cache")); let cache = HashCache::open(&cache_path, None, HashFn::Metro).unwrap(); let key = cache.key(&chunk, &metadata).unwrap(); cache .put(&key, &metadata, chunk.len, FileHash::from(12345)) .unwrap(); // modify the file use std::io::Write; let mut f = OpenOptions::new() .append(true) .open(path.to_path_buf()) .unwrap(); write!(f, "text").unwrap(); drop(f); let metadata = FileMetadata::new(&path).unwrap(); let cached_hash = cache.get(&key, &metadata).unwrap(); assert_eq!(cached_hash, None) }); } #[test] fn return_none_if_asked_for_a_different_chunk() { with_dir("cache/return_none_if_asked_for_a_different_chunk", |root| { let path = root.join("file"); create_file(&path); let path = Path::from(&path); let metadata = FileMetadata::new(&path).unwrap(); let chunk = FileChunk::new(&path, FilePos(0), FileLen(1000)); let cache_path = Path::from(root.join("cache")); let cache = HashCache::open(&cache_path, None, HashFn::Metro).unwrap(); let key = cache.key(&chunk, &metadata).unwrap(); cache 
.put(&key, &metadata, chunk.len, FileHash::from(12345)) .unwrap(); let chunk = FileChunk::new(&path, FilePos(1000), FileLen(2000)); let key = cache.key(&chunk, &metadata).unwrap(); let cached_hash = cache.get(&key, &metadata).unwrap(); assert_eq!(cached_hash, None) }); } #[test] fn return_none_if_different_transform_was_used() { with_dir( "cache/return_none_if_different_transform_was_used", |root| { let path = root.join("file"); create_file(&path); let path = Path::from(&path); let metadata = FileMetadata::new(&path).unwrap(); let chunk = FileChunk::new(&path, FilePos(0), FileLen(1000)); let cache_path = Path::from(root.join("cache")); let cache = HashCache::open(&cache_path, None, HashFn::Metro).unwrap(); let key = cache.key(&chunk, &metadata).unwrap(); let orig_hash = FileHash::from(12345); let data_len = FileLen(200); cache .put(&key, &metadata, data_len, orig_hash.clone()) .unwrap(); let cached_hash = cache.get(&key, &metadata).unwrap(); assert_eq!(cached_hash, Some((data_len, orig_hash))); drop(cache); // unlock the db so we can open another cache let cache = HashCache::open(&cache_path, Some("transform"), HashFn::Metro).unwrap(); let cached_hash = cache.get(&key, &metadata).unwrap(); assert_eq!(cached_hash, None); }, ); } } 0707010000000D000081A4000000000000000000000001653E86C200006F28000000000000000000000000000000000000002500000000fclones-0.34.0/fclones/src/config.rs//! Main program configuration. use std::collections::HashMap; use std::ffi::{OsStr, OsString}; use std::fmt::{Display, Formatter}; use std::io; use std::io::{stdin, BufRead, BufReader, ErrorKind}; use std::path::PathBuf; use std::str::FromStr; use std::sync::Arc; use chrono::{DateTime, FixedOffset, Local}; use clap::builder::{TypedValueParser, ValueParserFactory}; use clap::{command, Arg, Error}; use crate::file::FileLen; use crate::group::FileGroupFilter; use crate::group::Replication::{Overreplicated, Underreplicated}; use crate::hasher::HashFn; use crate::path::Path; use crate::pattern::{Pattern, PatternError, PatternOpts}; use crate::selector::PathSelector; use crate::transform::Transform; #[derive(Debug, Clone, Copy, clap::ValueEnum, Default)] pub enum OutputFormat { #[default] Default, Fdupes, Csv, Json, } impl OutputFormat { pub fn variants() -> Vec<&'static str> { vec!["default", "fdupes", "csv", "json"] } } impl Display for OutputFormat { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { match self { OutputFormat::Default => f.pad("default"), OutputFormat::Fdupes => f.pad("fdupes"), OutputFormat::Csv => f.pad("csv"), OutputFormat::Json => f.pad("json"), } } } impl FromStr for OutputFormat { type Err = String; fn from_str(s: &str) -> Result<Self, Self::Err> { match s.to_lowercase().as_str() { "default" => Ok(OutputFormat::Default), "fdupes" => Ok(OutputFormat::Fdupes), "csv" => Ok(OutputFormat::Csv), "json" => Ok(OutputFormat::Json), s => Err(format!("Unrecognized output format: {s}")), } } } /// Allows to read command line arguments as crate::path::Path. #[derive(Clone)] pub struct PathParser; impl TypedValueParser for PathParser { type Value = Path; fn parse_ref( &self, _cmd: &clap::Command, _arg: Option<&Arg>, value: &OsStr, ) -> Result<Self::Value, Error> { Ok(Path::from(value)) } } /// Allows to read command line arguments as crate::path::Path. 
impl ValueParserFactory for Path { type Parser = PathParser; fn value_parser() -> Self::Parser { PathParser } } /// Parses date time string, accepts wide range of human-readable formats fn parse_date_time(s: &str) -> Result<DateTime<FixedOffset>, String> { match dtparse::parse(s) { Ok((dt, Some(offset))) => Ok(DateTime::from_naive_utc_and_offset(dt, offset)), Ok((dt, None)) => { let local_offset = *Local::now().offset(); Ok(DateTime::from_naive_utc_and_offset(dt, local_offset)) } Err(e) => Err(format!("Failed to parse {s} as date: {e}")), } } /// Parses string with format: `<device>:<seq parallelism>[,<rand parallelism>]` fn parse_thread_count_option(s: &str) -> Result<(OsString, Parallelism), String> { let (key, value) = if s.contains(':') { let index = s.rfind(':').unwrap(); (&s[0..index], &s[(index + 1)..]) } else { ("default", s) }; let key = OsString::from(key); let value = value.to_string(); let mut pool_sizes = value .split(',') .map(|v| v.parse::<usize>().map_err(|e| format!("{e}: {v}"))); let random = match pool_sizes.next() { Some(v) => v?, None => return Err(String::from("Missing pool size specification")), }; let sequential = match pool_sizes.next() { Some(v) => v?, None => random, }; Ok((key, Parallelism { random, sequential })) } #[derive(Clone, Copy, Debug)] pub struct Parallelism { pub random: usize, pub sequential: usize, } // Configuration of the `group` subcommand #[derive(clap::Args, Debug, Default)] pub struct GroupConfig { /// Write the report to a file instead of the standard output #[arg(short = 'o', long, value_name = "PATH")] pub output: Option<PathBuf>, /// Set output file format #[arg( value_enum, short = 'f', long, ignore_case = true, default_value = "default" )] pub format: OutputFormat, /// Read the list of input paths from the standard input instead of the arguments. /// /// This flag is mostly useful together with Unix `find` utility. #[arg(long)] pub stdin: bool, /// Limit the recursion depth. /// /// 0 disables descending into directories. /// 1 descends into directories specified explicitly as input paths, /// but does not descend into subdirectories. #[arg(short = 'd', long, value_name = "NUMBER")] pub depth: Option<usize>, /// Include hidden files. #[arg(short = '.', long)] pub hidden: bool, /// Do not ignore files matching patterns listed in `.gitignore` and `.fdignore`. #[arg(short = 'A', long)] pub no_ignore: bool, /// Follow symbolic links. /// /// If this flag is set together with `--symbolic-links`, only links to /// directories are followed. #[arg(short = 'L', long)] pub follow_links: bool, /// Treat files reachable from multiple paths through links as duplicates. /// /// If `--symbolic-links` is not set, only hard links are matched. /// If `--symbolic-links` is set, both hard and symbolic links are matched. #[arg(short = 'H', long)] pub match_links: bool, /// Don't ignore symbolic links to files. /// /// Reports symbolic links, not their targets. /// Symbolic links are not treated as duplicates of their targets, /// unless `--match-links` is set. #[arg(short = 'S', long)] pub symbolic_links: bool, /// Don't count matching files found within the same directory argument as duplicates. #[arg(short('I'), long, conflicts_with("follow_links"))] pub isolate: bool, /// Don't match files on different filesystems or devices /// /// Does not follow symbolic links crossing filesystems or devices. /// Skips nested mount-points. #[arg(short('1'), long)] pub one_fs: bool, /// Transform each file by the specified program before matching. 
/// /// The value of this parameter should contain a command: the path to the program /// and optionally a list of space-separated arguments. /// /// By default, the file to process will be piped to the standard input of the program and the /// processed data will be read from the standard output. /// If the program does not support piping, but requires its input and/or output file path(s) /// to be specified in the argument list, denote these paths by `$IN` and `$OUT` special /// variables. /// /// If `$IN` is specified in the command string, the file will not be piped to the standard /// input, but copied first to a temporary location and that temporary location will be /// substituted as the value of `$IN` when launching the transform command. /// Similarly, if `$OUT` is specified in the command string, the result will not be read from /// the standard output, but fclones will expect the program to write to a named pipe /// specified by `$OUT` and will read output from there. /// /// If the program modifies the original file in-place without writing to the standard output /// nor a distinct file, use `--in-place` flag. #[arg(long, value_name("COMMAND"))] pub transform: Option<String>, /// Read the transform output from the same path as the transform input file. /// /// Set this flag if the command given to `--transform` transforms the file in-place, /// i.e. it modifies the original input file instead of writing to the standard output /// or to a new file. This flag tells fclones to read output from the original file /// after the transform command exited. #[arg(long)] pub in_place: bool, /// Don't copy the file to a temporary location before transforming, /// when `$IN` parameter is specified in the `--transform` command. /// /// If this flag is present, `$IN` will point to the original file. /// Caution: /// this option may speed up processing, but it may cause loss of data because it lets /// the transform command work directly on the original file. #[arg(long)] pub no_copy: bool, /// Search for over-replicated files with replication factor above the specified value. /// /// Specifying neither `--rf-over` nor `--rf-under` is equivalent to `--rf-over 1` which would /// report duplicate files. #[arg(short('n'), long, conflicts_with("rf_under"), value_name("COUNT"))] pub rf_over: Option<usize>, /// Search for under-replicated files with replication factor below the specified value. /// /// Specifying `--rf-under 2` will report unique files. #[arg(long, conflicts_with("rf_over"), value_name("COUNT"))] pub rf_under: Option<usize>, /// Instead of searching for duplicates, search for unique files. #[arg(long, conflicts_with_all(&["rf_over", "rf_under"]))] pub unique: bool, /// Minimum file size in bytes (inclusive). /// /// Units like KB, KiB, MB, MiB, GB, GiB are supported. #[arg(short = 's', long("min"), default_value = "1", value_name("BYTES"))] pub min_size: FileLen, /// Maximum file size in bytes (inclusive). /// /// Units like KB, KiB, MB, MiB, GB, GiB are supported. #[arg(long("max"), value_name("BYTES"))] pub max_size: Option<FileLen>, /// Maximum prefix size to check in bytes /// /// Units like KB, KiB, MB, MiB, GB, GiB are supported. /// /// Default: 16KiB for hard drives, 4KiB for SSDs #[arg(long("max-prefix-size"), value_name("BYTES"))] pub max_prefix_size: Option<FileLen>, /// Maximum suffix size to check in bytes /// /// Units like KB, KiB, MB, MiB, GB, GiB are supported. 
/// /// Default: 16KiB for hard drives, 4KiB for SSDs #[arg(long("max-suffix-size"), value_name("BYTES"))] pub max_suffix_size: Option<FileLen>, /// Include only file names matched fully by any of the given patterns. #[arg(long = "name", value_name("PATTERN"))] pub name_patterns: Vec<String>, /// Include only paths matched fully by any of the given patterns. #[arg(long = "path", value_name("PATTERN"))] pub path_patterns: Vec<String>, /// Ignore paths matched fully by any of the given patterns. #[arg(long = "exclude", value_name("PATTERN"))] pub exclude_patterns: Vec<String>, /// Make pattern matching case-insensitive. #[arg(short = 'i', long)] pub ignore_case: bool, /// Expect patterns as Perl compatible regular expressions instead of Unix globs. #[arg(short = 'x', long)] pub regex: bool, /// A hash function to use for computing file digests. #[arg(value_enum, long, value_name = "NAME", default_value = "metro")] pub hash_fn: HashFn, /// Skip the full contents hash step entirely. /// /// This is a *dangerous* option - there is a significantly increased risk of false positives, /// especially on smaller files or files with significant amounts of shared content that /// wouldn't be caught by prefix/suffix checks. /// /// It's recommended to use this with increased `--max-prefix-size` and `--max-suffix-size` /// options, to minimize false positives. /// /// Note: This option currently does nothing if you specify a `--transform` command #[arg(long)] pub skip_content_hash: bool, /// Enable caching of file hashes. /// /// Caching can significantly speed up subsequent runs of `fclones group` by avoiding /// recomputations of hashes of the files that haven't changed since the last scan. /// Beware though, that this option relies on file modification times recorded by the /// file system. It also increases memory and storage space consumption. #[arg(long)] pub cache: bool, /// Set the sizes of thread-pools /// /// The spec has the following format: `[<name>:]<r>[,<s>]`. /// The name can be one of: /// (1) a physical block device when prefixed with `dev:` e.g. `dev:/dev/sda`; /// (2) a type of device - `ssd`, `hdd`, `removable` or `unknown`; /// (3) a thread pool or thread pool group - `main`, `default`. /// If the name is not given, this option sets the size of the main thread pool /// and thread pools dedicated to SSD devices. /// /// The values `r` and `s` are integers denoting the sizes of the thread-pools used /// respectively for random access I/O and sequential I/O. If `s` is not given, it is /// assumed to be the same as `r`. /// /// This parameter can be used multiple times to configure multiple thread pools. #[arg( short, long, value_name = "SPEC", value_parser = parse_thread_count_option, verbatim_doc_comment)] pub threads: Vec<(OsString, Parallelism)>, /// Base directory to use when resolving relative input paths. #[arg(long, value_name = "PATH", default_value("."))] pub base_dir: Path, /// A list of input paths. /// /// Accepts files and directories. /// By default descends into directories recursively, unless a recursion depth /// limit is specified with `--depth`. #[arg(required_unless_present("stdin"))] pub paths: Vec<Path>, } impl GroupConfig { fn validate(&self) -> Result<(), String> { if self.isolate && self.paths.len() <= self.rf_over() { return Err(format!( "The --isolate flag requires that the number of input paths ({}) \ is at least as large as the replication factor lower bound ({}). 
\ No files would be considered duplicate, regardless of their contents.", self.paths.len(), self.rf_over() + 1, )); } if self.isolate && (self.rf_under.is_some() || self.unique) && self.paths.len() < self.rf_under() { return Err(format!( "The --isolate flag requires that the number of input paths ({}) \ is larger than the replication factor upper bound ({}). \ All files would be considered unique or under-replicated, \ regardless of their contents.", self.paths.len(), self.rf_under() - 1, )); } Ok(()) } fn compile_pattern(&self, s: &str) -> Result<Pattern, PatternError> { let pattern_opts = if self.ignore_case { PatternOpts::case_insensitive() } else { PatternOpts::default() }; if self.regex { Pattern::regex_with(s, &pattern_opts) } else { Pattern::glob_with(s, &pattern_opts) } } pub fn path_selector(&self, base_dir: &Path) -> Result<PathSelector, PatternError> { let include_names: Result<Vec<Pattern>, PatternError> = self .name_patterns .iter() .map(|p| self.compile_pattern(p)) .collect(); let include_paths: Result<Vec<Pattern>, PatternError> = self .path_patterns .iter() .map(|p| self.compile_pattern(p)) .collect(); let exclude_paths: Result<Vec<Pattern>, PatternError> = self .exclude_patterns .iter() .map(|p| self.compile_pattern(p)) .collect(); Ok(PathSelector::new(base_dir.clone()) .include_names(include_names?) .include_paths(include_paths?) .exclude_paths(exclude_paths?)) } pub fn group_filter(&self) -> FileGroupFilter { FileGroupFilter { replication: if self.unique { Underreplicated(2) } else if let Some(rf) = self.rf_under { Underreplicated(rf) } else { Overreplicated(self.rf_over()) }, root_paths: if self.isolate { self.input_paths().collect() } else { vec![] }, group_by_id: !self.match_links, } } pub fn rf_over(&self) -> usize { // don't prune small groups if: // - there is transformation defined // (distinct files can become identical after the transform) // - or we're looking for under-replicated files // - or we're looking for unique files if self.transform.is_some() || self.rf_under.is_some() || self.unique { 0 } else { self.rf_over.unwrap_or(1) } } pub fn rf_under(&self) -> usize { if self.unique { 2 } else { self.rf_under.unwrap_or(usize::MAX) } } pub fn search_type(&self) -> &'static str { if self.unique { "unique" } else if self.rf_under.is_some() { "under-replicated" } else { "redundant" } } /// Makes the base directory absolute. /// Returns error if the base directory does not exist. pub fn resolve_base_dir(&mut self) -> io::Result<&Path> { if self.base_dir.is_relative() { let curr_dir = Arc::from(Path::from(std::env::current_dir()?)); self.base_dir = curr_dir.join(&self.base_dir) } if !self.base_dir.to_path_buf().is_dir() { return Err(io::Error::new( ErrorKind::NotFound, format!("Directory not found: {}", self.base_dir.to_escaped_string()), )); } self.base_dir = self.base_dir.canonicalize(); Ok(&self.base_dir) } /// Returns an iterator over the absolute input paths. /// Input paths may be provided as arguments or from standard input. 
pub fn input_paths(&self) -> Box<dyn Iterator<Item = Path> + Send> { let base_dir = Arc::new(self.base_dir.clone()); if self.stdin { Box::new( BufReader::new(stdin()) .lines() .map(move |s| base_dir.resolve(Path::from(s.unwrap().as_str()))), ) } else { Box::new( self.paths .clone() .into_iter() .map(move |p| base_dir.resolve(p)), ) } } fn build_transform(&self, command: &str) -> io::Result<Transform> { let mut tr = Transform::new(command.to_string(), self.in_place)?; if self.no_copy { tr.copy = false }; Ok(tr) } /// Constructs the transform object. /// Returns None if the transform was not set pub fn transform(&self) -> Option<io::Result<Transform>> { self.transform .as_ref() .map(|command| self.build_transform(command)) } pub fn thread_pool_sizes(&self) -> HashMap<OsString, Parallelism> { let mut map = HashMap::new(); for (k, v) in self.threads.iter() { map.insert(k.clone(), *v); } map } } /// Controls which files in a group should be removed / moved / replaced by links. #[derive(Copy, Clone, Debug, clap::ValueEnum)] pub enum Priority { /// Give higher priority to the files listed higher in the input file. Top = 0, /// Give higher priority to the files listed lower in the input file. Bottom, /// Give higher priority to the files with the most recent creation time. Newest, /// Give higher priority to the files with the least recent creation time. Oldest, /// Give higher priority to the files with the most recent modification time. MostRecentlyModified, /// Give higher priority to the files with the least recent modification time. LeastRecentlyModified, /// Give higher priority to the files with the most recent access time. MostRecentlyAccessed, /// Give higher priority to the files with the least recent access time. LeastRecentlyAccessed, #[cfg(unix)] /// Give higher priority to the files with the most recent status change. MostRecentStatusChange, #[cfg(unix)] /// Give higher priority to the files with the least recent status change. LeastRecentStatusChange, /// Give higher priority to the files nested deeper in the directory tree. MostNested, /// Give higher priority to the files nested shallower in the directory tree. LeastNested, } /// Configures which files should be removed #[derive(clap::Args, Debug, Default)] #[command(disable_version_flag = true)] pub struct DedupeConfig { /// Don't perform any changes on the file-system, but writes a log of file operations /// to the standard output. #[arg(long)] pub dry_run: bool, /// Write the `dry_run` report to a file instead of the standard output. #[arg(short = 'o', long, value_name = "path")] pub output: Option<PathBuf>, /// Deduplicate only the files that were modified before the given time. /// /// If any of the files in a group was modified later, the whole group is skipped. #[arg( long, short = 'm', value_name = "timestamp", value_parser(parse_date_time) )] pub modified_before: Option<DateTime<FixedOffset>>, /// Keep at least n replicas untouched. /// /// If not given, it is assumed to be the same as the /// `--rf-over` value in the earlier `fclones group` run. #[arg( short = 'n', long, value_name = "COUNT", value_parser = clap::value_parser!(u64).range(1..) )] pub rf_over: Option<usize>, /// Restrict the set of files that can be removed or replaced by links to files /// with the name matching any given patterns. #[arg(long = "name", value_name = "PATTERN")] pub name_patterns: Vec<Pattern>, /// Restrict the set of files that can be removed or replaced by links to files /// with the path matching any given patterns. 
#[arg(long = "path", value_name = "PATTERN")] pub path_patterns: Vec<Pattern>, /// Set the priority for files to be removed or replaced by links. #[arg(value_enum, long, value_name = "PRIORITY")] pub priority: Vec<Priority>, /// Keep files with names matching any given patterns untouched. #[arg(long = "keep-name", value_name = "PATTERN")] pub keep_name_patterns: Vec<Pattern>, /// Keep files with paths matching any given patterns untouched. #[arg(long = "keep-path", value_name = "PATTERN")] pub keep_path_patterns: Vec<Pattern>, /// Specify a list of path prefixes. /// If non-empty, all duplicates having the same path prefix (root) are treated as one. /// This also means that the files sharing the same root can be either all /// dropped or all retained. /// /// By default, it is set to the input paths given as arguments to the earlier /// `fclones group` command, if `--isolate` option was present. #[arg(long = "isolate", value_name = "PATH")] pub isolated_roots: Vec<Path>, /// Treat files reachable from multiple paths through links as duplicates. #[arg(short = 'H', long)] pub match_links: bool, /// Don't lock files before performing an action on them. #[arg(long)] pub no_lock: bool, /// Allow the size of a file to be different than the size recorded during grouping. /// /// By default, files are checked for size to prevent accidentally removing a file /// that was modified since grouping. /// However, if `--transform` was used when grouping, the data sizes recorded in the `fclones group` /// report likely don't match the on-disk sizes of the files. Therefore, /// this flag is set automatically if `--transform` was used. #[arg(long)] pub no_check_size: bool, } #[derive(clap::Subcommand, Debug)] pub enum Command { /// Produce a list of groups of identical files. /// /// Scans the given directories recursively, computes hashes of files and groups /// files with the same hash together. /// Writes the list of groups of files to the standard output, unless the target file /// is specified. This command is safe and does not modify the filesystem. Group(GroupConfig), /// Replace redundant files with links. /// /// The list of groups earlier produced by `fclones group` should be submitted /// on the standard input. /// /// Unless `--soft` is specified, hard links are created. /// Creating hard links to files on different mount points is expected to fail. Link { #[clap(flatten)] config: DedupeConfig, /// Create soft (symbolic) links. #[arg(short, long)] soft: bool, }, /// Deduplicate file data using native filesystem deduplication capabilities. /// /// The list of groups earlier produced by `fclones group` should be submitted /// on the standard input. /// /// After successful deduplication, all file clones would still be visible as distinct files, /// but the data would be stored only once, hence taking up possibly less space than before. /// Unlike with hard links, modifying a file does not modify any of its clones. /// The result is not visible to userland applications, so repeated runs /// will find the same files again. This also applies to `fclones dedupe` itself: /// The options `--priority` and `--rf-over` do not detect earlier deduplications. /// /// This command cannot cross file system boundaries. /// Not all file systems support deduplication. /// Not all metadata is preserved on macOS. /// Unsupported on Windows. Dedupe { #[clap(flatten)] config: DedupeConfig, }, /// Remove redundant files. 
/// /// The list of groups earlier produced by `fclones group` should be submitted /// on the standard input. Remove(DedupeConfig), /// Move redundant files to the given directory. /// /// The list of groups earlier produced by `fclones group` should be submitted /// on the standard input. Move { #[clap(flatten)] config: DedupeConfig, /// Target directory where the redundant files should be moved to. #[arg()] target: PathBuf, }, } impl Command { pub fn validate(&self) -> Result<(), String> { match self { Command::Group(c) => c.validate(), _ => Ok(()), } } } const fn after_help() -> &'static str { "Written by Piotr Kołaczkowski and contributors.\n\ Licensed under MIT license." } /// Finds and cleans up redundant files #[derive(clap::Parser, Debug)] #[command(about, author, version, after_help = after_help(), max_term_width = 100)] pub struct Config { /// Suppress progress reporting #[arg(short('q'), long)] pub quiet: bool, /// Find files #[command(subcommand)] pub command: Command, } #[cfg(test)] mod test { use crate::config::{Command, Config}; use crate::path::Path; use assert_matches::assert_matches; use clap::Parser; use std::path::PathBuf; #[test] fn test_group_command() { let config: Config = Config::try_parse_from(vec!["fclones", "group", "dir1", "dir2"]).unwrap(); assert_matches!( config.command, Command::Group(g) if g.paths == vec![Path::from("dir1"), Path::from("dir2")]); } #[test] fn test_dedupe_command() { let config: Config = Config::try_parse_from(vec!["fclones", "dedupe"]).unwrap(); assert_matches!(config.command, Command::Dedupe { .. }); } #[test] fn test_remove_command() { let config: Config = Config::try_parse_from(vec!["fclones", "remove"]).unwrap(); assert_matches!(config.command, Command::Remove { .. }); } #[test] fn test_link_command() { let config: Config = Config::try_parse_from(vec!["fclones", "link"]).unwrap(); assert_matches!(config.command, Command::Link { .. }); } #[test] fn test_move_command() { let config: Config = Config::try_parse_from(vec!["fclones", "move", "target"]).unwrap(); assert_matches!( config.command, Command::Move { target, .. } if target == PathBuf::from("target")); } } 0707010000000E000081A4000000000000000000000001653E86C20000E823000000000000000000000000000000000000002500000000fclones-0.34.0/fclones/src/dedupe.rs//! Removing redundant files. use std::cmp::{max, min, Reverse}; use std::collections::HashMap; use std::fmt::{Display, Formatter}; use std::hash::{Hash, Hasher}; use std::io::{ErrorKind, Write}; use std::ops::{Add, AddAssign}; use std::sync::mpsc::channel; use std::sync::Arc; use std::time::SystemTime; use std::{fmt, fs, io}; use chrono::{DateTime, FixedOffset, Local}; use priority_queue::PriorityQueue; use rand::distributions::Alphanumeric; use rand::Rng; use rayon::iter::{IntoParallelIterator, ParallelBridge, ParallelIterator}; use crate::config::{DedupeConfig, Priority}; use crate::device::DiskDevices; use crate::file::{FileId, FileLen, FileMetadata}; use crate::group::{FileGroup, FileSubGroup}; use crate::lock::FileLock; use crate::log::{Log, LogExt}; use crate::path::Path; use crate::util::{max_result, min_result, try_sort_by_key}; use crate::{Error, TIMESTAMP_FMT}; /// Defines what to do with redundant files #[derive(Clone, Debug, PartialEq, Eq)] pub enum DedupeOp { /// Removes redundant files. Remove, /// Moves redundant files to a different dir Move(Arc<Path>), /// Replaces redundant files with soft-links (ln -s on Unix). SymbolicLink, /// Replaces redundant files with hard-links (ln on Unix).
HardLink, /// Reflink redundant files (cp --reflink=always, only some filesystems). RefLink, } /// Convenience struct for holding a path to a file and its metadata together #[derive(Clone, Debug)] pub struct PathAndMetadata { pub path: Path, pub metadata: FileMetadata, } impl PathAndMetadata { pub fn new(path: Path) -> io::Result<PathAndMetadata> { let metadata = FileMetadata::new(&path).map_err(|e| { io::Error::new( e.kind(), format!("Failed to read metadata of {}: {}", path.display(), e), ) })?; Ok(PathAndMetadata { metadata, path }) } } impl AsRef<PathAndMetadata> for PathAndMetadata { fn as_ref(&self) -> &PathAndMetadata { self } } impl AsRef<Path> for PathAndMetadata { fn as_ref(&self) -> &Path { &self.path } } impl AsRef<FileId> for PathAndMetadata { fn as_ref(&self) -> &FileId { self.metadata.as_ref() } } impl From<PathAndMetadata> for Path { fn from(value: PathAndMetadata) -> Self { value.path } } impl Display for PathAndMetadata { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { f.pad(self.path.display().as_str()) } } /// Portable abstraction for commands used to remove duplicates #[derive(Debug)] pub enum FsCommand { Remove { file: PathAndMetadata, }, Move { source: PathAndMetadata, target: Path, use_rename: bool, // try to move the file directly by issuing fs rename command }, SoftLink { target: Arc<PathAndMetadata>, link: PathAndMetadata, }, HardLink { target: Arc<PathAndMetadata>, link: PathAndMetadata, }, RefLink { target: Arc<PathAndMetadata>, link: PathAndMetadata, }, } impl FsCommand { /// Obtains a lock to the file if lock == true. fn maybe_lock(path: &Path, lock: bool) -> io::Result<Option<FileLock>> { if lock { match FileLock::new(path) { Ok(lock) => Ok(Some(lock)), Err(e) if e.kind() == ErrorKind::Unsupported => Ok(None), Err(e) => Err(e), } } else { Ok(None) } } pub fn remove(path: &Path) -> io::Result<()> { fs::remove_file(path.to_path_buf()).map_err(|e| { io::Error::new( e.kind(), format!("Failed to remove file {}: {}", path.display(), e), ) }) } #[cfg(unix)] fn symlink_internal(target: &std::path::Path, link: &std::path::Path) -> io::Result<()> { std::os::unix::fs::symlink(target, link) } #[cfg(windows)] fn symlink_internal(target: &std::path::Path, link: &std::path::Path) -> io::Result<()> { std::os::windows::fs::symlink_file(target, link) } fn symlink(target: &Path, link: &Path) -> io::Result<()> { Self::symlink_internal(&target.to_path_buf(), &link.to_path_buf()).map_err(|e| { io::Error::new( e.kind(), format!( "Failed to create symbolic link {} -> {}: {}", link.display(), target.display(), e ), ) }) } fn hardlink(target: &Path, link: &Path) -> io::Result<()> { fs::hard_link(target.to_path_buf(), link.to_path_buf()).map_err(|e| { io::Error::new( e.kind(), format!( "Failed to create hard link {} -> {}: {}", link.display(), target.display(), e ), ) }) } fn check_can_rename(source: &Path, target: &Path) -> io::Result<()> { if target.to_path_buf().exists() { return Err(io::Error::new( ErrorKind::AlreadyExists, format!( "Cannot move {} to {}: Target already exists", source.display(), target.display() ), )); } Ok(()) } fn mkdirs(path: &Path) -> io::Result<()> { fs::create_dir_all(path.to_path_buf()).map_err(|e| { io::Error::new( e.kind(), format!("Failed to create directory {}: {}", path.display(), e), ) }) } /// Renames/moves a file from one location to another. /// If the target exists, it would be overwritten. 
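// Backed by `std::fs::rename`: replacing an existing target is atomic on POSIX systems,
// but the call fails when source and target live on different mount points, which is why
// `execute` falls back to `move_copy` for the Move command.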
pub fn unsafe_rename(source: &Path, target: &Path) -> io::Result<()> { fs::rename(source.to_path_buf(), target.to_path_buf()).map_err(|e| { io::Error::new( e.kind(), format!( "Failed to rename file from {} to {}: {}", source.display(), target.display(), e ), ) }) } /// Copies a file from one location to another. /// If the target exists, it would be overwritten. fn unsafe_copy(source: &Path, target: &Path) -> io::Result<()> { fs::copy(source.to_path_buf(), target.to_path_buf()).map_err(|e| { io::Error::new( e.kind(), format!( "Failed to copy file from {} to {}: {}", source.display(), target.display(), e ), ) })?; Ok(()) } /// Moves the file from one location to another by single `fs::rename` command. /// Fails if target exists. fn move_rename(source: &Path, target: &Path) -> io::Result<()> { Self::check_can_rename(source, target)?; Self::mkdirs(target.parent().unwrap())?; Self::unsafe_rename(source, target)?; Ok(()) } /// Moves the file by copying it first to another location and then removing the original. /// Fails if target exists. fn move_copy(source: &Path, target: &Path) -> io::Result<()> { Self::check_can_rename(source, target)?; Self::mkdirs(target.parent().unwrap())?; Self::unsafe_copy(source, target)?; Self::remove(source)?; Ok(()) } /// Returns a random temporary file name in the same directory, guaranteed to not collide with /// any other file in the same directory pub fn temp_file(path: &Path) -> Path { let mut name = path .file_name() .expect("must be a regular file with a name"); name.push("."); name.push( rand::thread_rng() .sample_iter(&Alphanumeric) .take(24) .map(char::from) .collect::<String>(), ); match path.parent() { Some(parent) => parent.join(Path::from(name)), None => Path::from(name), } } /// Safely moves the file to a different location and invokes the function. /// If the function fails, moves the file back to the original location. /// If the function succeeds, removes the file permanently. pub fn safe_remove<R>( path: &Path, f: impl FnOnce(&Path) -> io::Result<R>, log: &dyn Log, ) -> io::Result<R> { let tmp = Self::temp_file(path); Self::unsafe_rename(path, &tmp)?; let result = match f(path) { Ok(result) => result, Err(e) => { // Try to undo the move if possible if let Err(remove_err) = Self::unsafe_rename(&tmp, path) { log.warn(format!( "Failed to undo move from {} to {}: {}", &path.display(), &tmp.display(), remove_err )) } return Err(e); } }; // Cleanup the temp file. 
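// At this point `f` has already succeeded, so failing to delete the temporary file only
// leaks that file; the error is logged as a warning below and the result is still Ok.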
if let Err(e) = Self::remove(&tmp) { log.warn(format!( "Failed to remove temporary {}: {}", &tmp.display(), e )) } Ok(result) } /// Executes the command and returns the number of bytes reclaimed pub fn execute(&self, should_lock: bool, log: &dyn Log) -> io::Result<FileLen> { match self { FsCommand::Remove { file } => { let _ = Self::maybe_lock(&file.path, should_lock)?; Self::remove(&file.path)?; Ok(file.metadata.len()) } FsCommand::SoftLink { target, link } => { let _ = Self::maybe_lock(&link.path, should_lock)?; Self::safe_remove(&link.path, |link| Self::symlink(&target.path, link), log)?; Ok(link.metadata.len()) } FsCommand::HardLink { target, link } => { let _ = Self::maybe_lock(&link.path, should_lock)?; Self::safe_remove(&link.path, |link| Self::hardlink(&target.path, link), log)?; Ok(link.metadata.len()) } FsCommand::RefLink { target, link } => { let _ = Self::maybe_lock(&link.path, should_lock)?; crate::reflink::reflink(target, link, log)?; Ok(link.metadata.len()) } FsCommand::Move { source, target, use_rename, } => { let _ = Self::maybe_lock(&source.path, should_lock); let len = source.metadata.len(); if *use_rename && Self::move_rename(&source.path, target).is_ok() { return Ok(len); } Self::move_copy(&source.path, target)?; Ok(len) } } } /// Returns the path to be affected by running the command. /// For commands that move or remove the file, it returns the path to the file before removal. /// For commands that create a link, returns the path to file that will be replaced by the link. pub fn file_to_remove(&self) -> &Path { match self { FsCommand::Remove { file, .. } | FsCommand::SoftLink { link: file, .. } | FsCommand::HardLink { link: file, .. } | FsCommand::RefLink { link: file, .. } | FsCommand::Move { source: file, .. } => &file.path, } } /// Returns how much disk space running this command would reclaim pub fn space_to_reclaim(&self) -> FileLen { match self { FsCommand::Remove { file, .. } | FsCommand::SoftLink { link: file, .. } | FsCommand::HardLink { link: file, .. } | FsCommand::RefLink { link: file, .. } | FsCommand::Move { source: file, .. } => file.metadata.len(), } } /// Formats the command as a string that can be pasted to a Unix shell (e.g. bash) #[cfg(unix)] pub fn to_shell_str(&self) -> Vec<String> { let mut result = Vec::new(); match self { FsCommand::Remove { file, .. } => { let path = file.path.quote(); result.push(format!("rm {path}")); } FsCommand::SoftLink { target, link, .. } => { let tmp = Self::temp_file(&link.path); let target = target.path.quote(); let link = link.path.quote(); result.push(format!("mv {} {}", link, tmp.quote())); result.push(format!("ln -s {target} {link}")); result.push(format!("rm {}", tmp.quote())); } FsCommand::HardLink { target, link, .. } => { let tmp = Self::temp_file(&link.path); let target = target.path.quote(); let link = link.path.quote(); result.push(format!("mv {} {}", link, tmp.quote())); result.push(format!("ln {target} {link}")); result.push(format!("rm {}", tmp.quote())); } FsCommand::RefLink { target, link, .. } => { let tmp = Self::temp_file(&link.path); let target = target.path.quote(); let link = link.path.quote(); // Not really what happens on Linux, there the `mv` is also a reflink. 
result.push(format!("mv {} {}", link, tmp.quote())); if cfg!(target_os = "macos") { result.push(format!("cp -c {target} {link}")); } else { result.push(format!("cp --reflink=always {target} {link}")); }; result.push(format!("rm {}", tmp.quote())); } FsCommand::Move { source, target, use_rename, } => { let source = source.path.quote(); let target = target.quote(); if *use_rename { result.push(format!("mv {} {}", &source, &target)); } else { result.push(format!("cp {} {}", &source, &target)); result.push(format!("rm {}", &source)); } } } result } #[cfg(windows)] pub fn to_shell_str(&self) -> Vec<String> { let mut result = Vec::new(); match self { FsCommand::Remove { file, .. } => { let path = file.path.quote(); result.push(format!("del {}", path)); } FsCommand::SoftLink { target, link, .. } => { let tmp = Self::temp_file(&link.path); let target = target.path.quote(); let link = link.path.quote(); result.push(format!("move {} {}", link, tmp.quote())); result.push(format!("mklink {} {}", target, link)); result.push(format!("del {}", tmp.quote())); } FsCommand::HardLink { target, link, .. } => { let tmp = Self::temp_file(&link.path); let target = target.path.quote(); let link = link.path.quote(); result.push(format!("move {} {}", link, tmp.quote())); result.push(format!("mklink /H {} {}", target, link)); result.push(format!("del {}", tmp.quote())); } FsCommand::RefLink { target, link, .. } => { result.push(format!(":: deduplicate {} {}", link, target)); } FsCommand::Move { source, target, use_rename, } => { let source = source.path.quote(); let target = target.quote(); if *use_rename { result.push(format!("move {} {}", &source, &target)); } else { result.push(format!("copy {} {}", &source, &target)); result.push(format!("del {}", &source)); } } } result } } /// Provides information about the number of deduplicated files and reclaimed disk space #[derive(Default)] pub struct DedupeResult { pub processed_count: u64, pub reclaimed_space: FileLen, } impl Add<DedupeResult> for DedupeResult { type Output = DedupeResult; fn add(self, rhs: Self) -> Self::Output { DedupeResult { processed_count: self.processed_count + rhs.processed_count, reclaimed_space: self.reclaimed_space + rhs.reclaimed_space, } } } impl AddAssign for DedupeResult { fn add_assign(&mut self, rhs: Self) { self.processed_count += rhs.processed_count; self.reclaimed_space += rhs.reclaimed_space; } } /// Returns true if any of the files have been modified after the given timestamp. /// Also returns true if file timestamp could not be read. fn was_modified(files: &[PathAndMetadata], after: DateTime<FixedOffset>, log: &dyn Log) -> bool { let mut result = false; let after: DateTime<Local> = after.into(); for PathAndMetadata { path: p, metadata: m, .. 
} in files.iter() { match m.modified() { Ok(file_timestamp) => { let file_timestamp: DateTime<Local> = file_timestamp.into(); if file_timestamp > after { log.warn(format!( "File {} was updated after {} (at {})", p.display(), after.format(TIMESTAMP_FMT), file_timestamp.format(TIMESTAMP_FMT) )); result = true; } } Err(e) => { log.warn(format!( "Failed to read modification time of file {}: {}", p.display(), e )); result = true; } } } result } /// Returns true if given path matches any of the `keep` patterns fn should_keep(path: &Path, config: &DedupeConfig) -> bool { let matches_any_name = config .keep_name_patterns .iter() .any(|p| match path.file_name_cstr() { Some(name) => p.matches(name.to_string_lossy().as_ref()), None => false, }); let matches_any_path = || { config .keep_path_patterns .iter() .any(|p| p.matches_path(&path.to_path_buf())) }; matches_any_name || matches_any_path() } /// Returns true if given path matches all of the `drop` patterns. /// If there are no `drop` patterns, returns true. fn may_drop(path: &Path, config: &DedupeConfig) -> bool { let matches_any_name = || { config .name_patterns .iter() .any(|p| match path.file_name_cstr() { Some(name) => p.matches(name.to_string_lossy().as_ref()), None => false, }) }; let matches_any_path = || { config .path_patterns .iter() .any(|p| p.matches_path(&path.to_path_buf())) }; (config.name_patterns.is_empty() && config.path_patterns.is_empty()) || matches_any_name() || matches_any_path() } impl<P: AsRef<PathAndMetadata>> FileSubGroup<P> { /// Returns the time of the earliest creation of a file in the subgroup pub fn created(&self) -> Result<SystemTime, Error> { Ok(min_result(self.files.iter().map(|f| { let f = f.as_ref(); f.metadata.created().map_err(|e| { format!( "Failed to read creation time of file {}: {}", f.path.display(), e ) }) }))? .unwrap()) } /// Returns the time of the latest modification of a file in the subgroup pub fn modified(&self) -> Result<SystemTime, Error> { Ok(max_result(self.files.iter().map(|f| { let f = f.as_ref(); f.metadata.modified().map_err(|e| { format!( "Failed to read modification time of file {}: {}", f.path.display(), e ) }) }))? .unwrap()) } /// Returns the time of the latest access of a file in the subgroup pub fn accessed(&self) -> Result<SystemTime, Error> { Ok(max_result(self.files.iter().map(|f| { let f = f.as_ref(); f.metadata.accessed().map_err(|e| { format!( "Failed to read access time of file {}: {}", f.path.display(), e ) }) }))? 
.unwrap()) } /// Returns the time of the latest status change of a file in the subgroup #[cfg(unix)] pub fn status_changed(&self) -> Result<(i64, i64), Error> { use std::os::unix::fs::MetadataExt; Ok(self .files .iter() .map(|f| { let f = f.as_ref(); (f.metadata.ctime(), f.metadata.ctime_nsec()) }) .max() .unwrap()) } /// Returns true if any of the files in the subgroup must be kept pub fn should_keep(&self, config: &DedupeConfig) -> bool { self.files .iter() .any(|f| should_keep(&f.as_ref().path, config)) } /// Returns true if all files in the subgroup can be dropped pub fn may_drop(&self, config: &DedupeConfig) -> bool { self.files .iter() .all(|f| may_drop(&f.as_ref().path, config)) } /// Returns the number of components of the least nested path pub fn min_nesting(&self) -> usize { self.files .iter() .map(|f| f.as_ref().path.component_count()) .min() .unwrap() } /// Returns the number of components of the most nested path pub fn max_nesting(&self) -> usize { self.files .iter() .map(|f| f.as_ref().path.component_count()) .max() .unwrap() } } /// Sort files so that files with highest priority (newest, most recently updated, /// recently accessed, etc) are sorted last. /// In cases when metadata of a file cannot be accessed, an error message is pushed /// in the result vector and such file is placed at the beginning of the list. pub fn sort_by_priority<P>(files: &mut [FileSubGroup<P>], priority: &Priority) -> Vec<Error> where P: AsRef<PathAndMetadata>, { match priority { Priority::Top => { files.reverse(); vec![] } Priority::Bottom => vec![], Priority::Newest => try_sort_by_key(files, |m| m.created()), Priority::Oldest => try_sort_by_key(files, |m| m.created().map(Reverse)), Priority::MostRecentlyModified => try_sort_by_key(files, |m| m.modified()), Priority::LeastRecentlyModified => try_sort_by_key(files, |m| m.modified().map(Reverse)), Priority::MostRecentlyAccessed => try_sort_by_key(files, |m| m.accessed()), Priority::LeastRecentlyAccessed => try_sort_by_key(files, |m| m.accessed().map(Reverse)), #[cfg(unix)] Priority::MostRecentStatusChange => try_sort_by_key(files, |m| m.status_changed()), #[cfg(unix)] Priority::LeastRecentStatusChange => { try_sort_by_key(files, |m| m.status_changed().map(Reverse)) } Priority::MostNested => { files.sort_by_key(|m| m.max_nesting()); vec![] } Priority::LeastNested => { files.sort_by_key(|m| Reverse(m.min_nesting())); vec![] } } } #[derive(Debug)] pub struct PartitionedFileGroup { pub to_keep: Vec<PathAndMetadata>, pub to_drop: Vec<PathAndMetadata>, } impl PartitionedFileGroup { /// Returns the destination path where the file should be moved when the /// dedupe mode was selected to move fn move_target(target_dir: &Arc<Path>, source_path: &Path) -> Path { let root = source_path .root() .map(|p| p.to_string_lossy().replace(['/', '\\', ':'], "")); let suffix = source_path.strip_root(); match root { None => target_dir.join(suffix), Some(root) => Arc::new(target_dir.join(Path::from(root))).join(suffix), } } fn are_on_same_mount(devices: &DiskDevices, file1: &Path, file2: &Path) -> bool { let mount1 = devices.get_mount_point(file1); let mount2 = devices.get_mount_point(file2); mount1 == mount2 } /// Returns a list of commands that would remove redundant files in this group when executed. 
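// For the link-based strategies the first file in `to_keep` becomes the target of every
// generated link command, e.g. with to_keep = [a, b] and to_drop = [c, d] both c and d
// are pointed at a; Remove and Move simply drop or relocate each file in `to_drop`.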
pub fn dedupe_script(mut self, strategy: &DedupeOp, devices: &DiskDevices) -> Vec<FsCommand> { if self.to_drop.is_empty() { return vec![]; } assert!( !self.to_keep.is_empty(), "No files would be left after deduplicating" ); let mut commands = Vec::new(); let retained_file = Arc::new(self.to_keep.swap_remove(0)); for dropped_file in self.to_drop { match strategy { DedupeOp::SymbolicLink => commands.push(FsCommand::SoftLink { target: retained_file.clone(), link: dropped_file, }), DedupeOp::HardLink => commands.push(FsCommand::HardLink { target: retained_file.clone(), link: dropped_file, }), DedupeOp::RefLink => commands.push(FsCommand::RefLink { target: retained_file.clone(), link: dropped_file, }), DedupeOp::Remove => commands.push(FsCommand::Remove { file: dropped_file }), DedupeOp::Move(target_dir) => { let source = dropped_file; let source_path = &source.path; let use_rename = Self::are_on_same_mount(devices, source_path, target_dir); let target = Self::move_target(target_dir, source_path); commands.push(FsCommand::Move { source, target, use_rename, }) } } } commands } } /// Attempts to retrieve the metadata of all the files in the file group. /// If metadata is inaccessible for a file, a warning is emitted to the log, and None gets returned. fn fetch_files_metadata<P>(group: FileGroup<P>, log: &dyn Log) -> Option<FileGroup<PathAndMetadata>> where P: Into<Path>, { group .try_map_all(|p| { PathAndMetadata::new(p.into()).map_err(|e| { log.warn(&e); }) }) .ok() } /// Partitions a group of files into files to keep and files that can be safely dropped /// (or linked). fn partition( group: FileGroup<PathAndMetadata>, config: &DedupeConfig, log: &dyn Log, ) -> Result<PartitionedFileGroup, Error> { let file_len = group.file_len; let file_hash = group.file_hash.clone(); let mut files = group.files; let error = |msg: &str| { Err(Error::from(format!( "Could not determine files to drop in group with hash {} and len {}: {}", file_hash, file_len.0, msg ))) }; // We don't want to remove dirs or symlinks files.retain(|m| { let is_file = m.metadata.is_file(); if !is_file { log.warn(format!( "Skipping file {}: Not a regular file", m.path.display() )); } is_file }); // If file has a different length, then we really know it has been modified. // Therefore, it does not belong to the group and we can safely skip it. if !config.no_check_size { files.retain(|m| { let len_ok = m.metadata.len() == file_len; if !len_ok { log.warn(format!( "Skipping file {} with length {} different than the group length {}", m.path.display(), m.metadata.len(), file_len.0, )); } len_ok }); } // Bail out as well if any file has been modified after `config.modified_before`. // We need to skip the whole group, because we don't know if these files are really different. if let Some(max_timestamp) = config.modified_before { if was_modified(&files, max_timestamp, log) { return error("Some files could be updated since the previous run of fclones"); } } let mut file_sub_groups = FileSubGroup::group(files, &config.isolated_roots, !config.match_links); // Sort files to remove in user selected order. // The priorities at the beginning of the argument list have precedence over // the priorities given at the end of the argument list, therefore we're applying // them in reversed order. 
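// Illustrative example (assuming the underlying sort is stable): with
// `--priority newest --priority oldest` the subgroups are first sorted by "oldest",
// then re-sorted by "newest", so the first-listed priority decides the final order
// and later ones only break ties.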
let mut sort_errors = Vec::new(); for priority in config.priority.iter().rev() { sort_errors.extend(sort_by_priority(&mut file_sub_groups, priority)); } if !sort_errors.is_empty() { for e in sort_errors { log.warn(e); } return error("Metadata of some files could not be read."); } // Split the set of file subgroups into two sets - a set that we want to keep intact and a set // that we can remove or replace with links: let (mut to_retain, mut to_drop): (Vec<_>, Vec<_>) = file_sub_groups .into_iter() .partition(|m| m.should_keep(config) || !m.may_drop(config)); // If the set to retain is smaller than the number of files we must keep (rf), then // move some higher priority files from `to_drop` and append them to `to_retain`. let n = max(1, config.rf_over.unwrap_or(1)); let missing_count = min(to_drop.len(), n.saturating_sub(to_retain.len())); to_retain.extend(to_drop.drain(0..missing_count)); assert!(to_retain.len() >= n || to_drop.is_empty()); Ok(PartitionedFileGroup { to_keep: to_retain.into_iter().flat_map(|g| g.files).collect(), to_drop: to_drop.into_iter().flat_map(|g| g.files).collect(), }) } /// Generates a list of commands that will remove the redundant files in the groups provided /// by the `groups` iterator. /// /// Calling this is perfectly safe - the function does not perform any disk changes. /// /// This function performs extensive checks if files can be removed. /// It rejects a group of files if: /// - metadata of any files in the group cannot be read, /// - any file in the group was modified after the `modified_before` configuration property /// /// Additionally it will never emit commands to remove a file which: /// - has length that does not match the file length recorded in the group metadata /// - was matched by any of the `retain_path` or `retain_name` patterns /// - was not matched by all `drop_path` and `drop_name` patterns /// /// The commands in the list are grouped into vectors where each /// vector has its sequential id. This id allows to convert the parallel iterator into a /// sequential iterator with the same order as the groups in the input file. /// Unfortunately Rayon does not allow to convert a parallel iterator /// to a sequential iterator easily, so we need this hack with prepending ids of each group. /// /// # Parameters /// - `groups`: iterator over groups of identical files /// - `op`: what to do with duplicates /// - `config`: controls which files from each group to remove / link /// - `log`: logging target pub fn dedupe<'a, I, P>( groups: I, op: DedupeOp, config: &'a DedupeConfig, log: &'a dyn Log, ) -> impl ParallelIterator<Item = (usize, Vec<FsCommand>)> + Send + 'a where I: IntoIterator<Item = FileGroup<P>> + 'a, I::IntoIter: Send, P: Into<Path> + AsRef<Path> + fmt::Debug + Send + 'a, { let devices = DiskDevices::new(&HashMap::new()); let disallow_cross_device = op == DedupeOp::HardLink || op == DedupeOp::RefLink; groups .into_iter() .enumerate() .par_bridge() .map(move |(i, group)| { let mut commands = Vec::new(); if let Some(group) = fetch_files_metadata(group, log) { let groups = if disallow_cross_device { group.partition_by_key(|p| p.metadata.device_id()) } else { vec![group] }; for group in groups { match partition(group, config, log) { Ok(group) => commands.extend(group.dedupe_script(&op, &devices)), Err(e) => log.warn(e), } } } (i, commands) }) } /// Runs a deduplication script generated by [`dedupe`]. /// /// Calling this function is going to change the contents of the file-system. /// No safety checks are performed. 
/// Commands are executed in parallel, on the default Rayon thread-pool. /// On command execution failure, a warning is logged and the execution of remaining commands /// continues. /// Returns the number of files processed and the amount of disk space reclaimed. pub fn run_script<I>(script: I, should_lock: bool, log: &dyn Log) -> DedupeResult where I: IntoParallelIterator<Item = (usize, Vec<FsCommand>)>, { script .into_par_iter() .flat_map(|(_, cmd_vec)| cmd_vec) .map(|cmd| cmd.execute(should_lock, log)) .inspect(|res| { if let Err(e) = res { log.warn(e); } }) .filter_map(|res| res.ok()) .map(|len| DedupeResult { processed_count: 1, reclaimed_space: len, }) .reduce(DedupeResult::default, |a, b| a + b) } /// We need this so we can put command vectors in a priority queue. struct FsCommandGroup { index: usize, commands: Vec<FsCommand>, } impl FsCommandGroup { pub fn new(index: usize, commands: Vec<FsCommand>) -> FsCommandGroup { FsCommandGroup { index, commands } } } impl PartialEq<Self> for FsCommandGroup { fn eq(&self, other: &Self) -> bool { self.index == other.index } } impl Eq for FsCommandGroup {} impl Hash for FsCommandGroup { fn hash<H: Hasher>(&self, state: &mut H) { self.index.hash(state) } } /// Prints a script generated by [`dedupe`] to stdout. /// /// Does not perform any filesystem changes. /// Returns the number of files processed and the amount of disk space that would be /// reclaimed if all commands of the script were executed with no error. pub fn log_script( script: impl IntoParallelIterator<Item = (usize, Vec<FsCommand>)> + Send, mut out: impl Write + Send, ) -> io::Result<DedupeResult> { // Unfortunately the items may come in any order from the ParallelIterator, // and that order may change with each run, because multiple threads race to produce // the next item. However, we want to print the commands // in the same deterministic order as the groups in the input file. // That's why we first send the commands to a PriorityQueue through a channel in order to // get them all on a single thread. The queue sorts them by their sequential identifiers. // Because the identifiers are consecutive integers, we know which item should be printed next. let (tx, rx) = channel(); // Scope needed so the script iterator doesn't need to be 'static. // This way we tell the compiler the background thread we start to process the iterator // terminates before exiting this function. crossbeam_utils::thread::scope(move |s| { // Process items in parallel in a background thread, so we can read as soon as they // are produced: s.spawn(move |_| { script .into_par_iter() .for_each_with(tx, |tx, item| tx.send(item).unwrap()) }); let mut queue = PriorityQueue::new(); let mut next_group_index = 0; let mut processed_count = 0; let mut reclaimed_space = FileLen(0); while let Ok((group_index, commands)) = rx.recv() { // Push the command group we received from the iterator. // We may receive them in an incorrect order, so we push them to a PriorityQueue. queue.push( FsCommandGroup::new(group_index, commands), Reverse(group_index), // we want to get items with lowest-index first ); // Process items in the queue as soon as possible to save memory. while let Some((group, _)) = queue.peek() { // Only process the item if it is the next one we expect. // If we see an out-of-order item, we simply need to wait for more items // to be pushed. 
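// Illustrative example: if the groups arrive as 2, 0, 1, group 2 sits in the queue
// until group 0 (the expected index) is received and printed, then 1 and 2 follow.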
if group.index != next_group_index { break; } // We got the right item, let's pop it from the queue and log it: next_group_index += 1; let cmd_vec = queue.pop().unwrap().0.commands; for cmd in cmd_vec { processed_count += 1; reclaimed_space += cmd.space_to_reclaim(); for line in cmd.to_shell_str() { writeln!(out, "{line}")?; } } } } Ok(DedupeResult { processed_count, reclaimed_space, }) }) .unwrap() } #[cfg(test)] mod test { use std::collections::HashSet; use std::default::Default; use std::fs::{create_dir, create_dir_all}; use std::path::PathBuf; use std::str::FromStr; use std::{thread, time}; use chrono::Duration; use itertools::Itertools; use crate::config::GroupConfig; use crate::file::FileHash; use crate::group_files; use crate::log::StdLog; use crate::pattern::Pattern; use crate::util::test::{create_file, create_file_newer_than, read_file, with_dir, write_file}; use super::*; #[test] fn test_temp_file_name_generation() { let path = Path::from("/foo/bar"); let temp = FsCommand::temp_file(&path); assert_ne!(path, temp); assert_ne!( path.file_name().unwrap().len(), temp.file_name().unwrap().len() ); assert_eq!(path.parent(), temp.parent()); } #[test] fn test_remove_command_removes_file() { with_dir("dedupe/remove_cmd", |root| { let log = StdLog::new(); let file_path = root.join("file"); create_file(&file_path); let file = PathAndMetadata::new(Path::from(&file_path)).unwrap(); let cmd = FsCommand::Remove { file }; cmd.execute(true, &log).unwrap(); assert!(!file_path.exists()) }) } #[test] fn test_move_command_moves_file_by_rename() { with_dir("dedupe/move_rename_cmd", |root| { let log = StdLog::new(); let file_path = root.join("file"); let target = Path::from(root.join("target")); create_file(&file_path); let file = PathAndMetadata::new(Path::from(&file_path)).unwrap(); let cmd = FsCommand::Move { source: file, target: target.clone(), use_rename: true, }; cmd.execute(true, &log).unwrap(); assert!(!file_path.exists()); assert!(target.to_path_buf().exists()); }) } #[test] fn test_move_command_moves_file_by_copy() { with_dir("dedupe/move_copy_cmd", |root| { let log = StdLog::new(); let file_path = root.join("file"); let target = Path::from(root.join("target")); create_file(&file_path); let file = PathAndMetadata::new(Path::from(&file_path)).unwrap(); let cmd = FsCommand::Move { source: file, target: target.clone(), use_rename: false, }; cmd.execute(true, &log).unwrap(); assert!(!file_path.exists()); assert!(target.to_path_buf().exists()); }) } #[test] fn test_move_fails_if_target_exists() { with_dir("dedupe/move_target_exists", |root| { let log = StdLog::new(); let file_path = root.join("file"); let target = root.join("target"); create_file(&file_path); create_file(&target); let file = PathAndMetadata::new(Path::from(&file_path)).unwrap(); let cmd = FsCommand::Move { source: file, target: Path::from(&target), use_rename: false, }; assert!(cmd.execute(true, &log).is_err()); }) } #[test] fn test_soft_link_command_replaces_file_with_a_link() { with_dir("dedupe/soft_link_cmd", |root| { let log = StdLog::new(); let file_path_1 = root.join("file_1"); let file_path_2 = root.join("file_2"); write_file(&file_path_1, "foo"); write_file(&file_path_2, ""); let file_1 = PathAndMetadata::new(Path::from(&file_path_1)).unwrap(); let file_2 = PathAndMetadata::new(Path::from(&file_path_2)).unwrap(); let cmd = FsCommand::SoftLink { target: Arc::new(file_1), link: file_2, }; cmd.execute(true, &log).unwrap(); assert!(file_path_1.exists()); assert!(file_path_2.exists()); 
assert!(fs::symlink_metadata(&file_path_2) .unwrap() .file_type() .is_symlink()); assert_eq!(read_file(&file_path_2), "foo"); }) } #[test] fn test_hard_link_command_replaces_file_with_a_link() { with_dir("dedupe/hard_link_cmd", |root| { let log = StdLog::new(); let file_path_1 = root.join("file_1"); let file_path_2 = root.join("file_2"); write_file(&file_path_1, "foo"); write_file(&file_path_2, ""); let file_1 = PathAndMetadata::new(Path::from(&file_path_1)).unwrap(); let file_2 = PathAndMetadata::new(Path::from(&file_path_2)).unwrap(); let cmd = FsCommand::HardLink { target: Arc::new(file_1), link: file_2, }; cmd.execute(true, &log).unwrap(); assert!(file_path_1.exists()); assert!(file_path_2.exists()); assert_eq!(read_file(&file_path_2), "foo"); }) } /// Creates 3 empty files with different creation time and returns a FileGroup describing them fn make_group(root: &PathBuf, file_hash: FileHash) -> FileGroup<Path> { create_dir_all(root).unwrap(); let file_1 = root.join("file_1"); let file_2 = root.join("file_2"); let file_3 = root.join("file_3"); create_file(&file_1); let ctime_1 = fs::metadata(&file_1).unwrap().modified().unwrap(); let ctime_2 = create_file_newer_than(&file_2, ctime_1); create_file_newer_than(&file_3, ctime_2); FileGroup { file_len: FileLen(0), file_hash, files: vec![ Path::from(&file_1), Path::from(&file_2), Path::from(&file_3), ], } } #[test] fn test_partition_selects_files_for_removal() { with_dir("dedupe/partition/basic", |root| { let group = make_group(root, FileHash::from_str("00").unwrap()); let group = group.map(|p| PathAndMetadata::new(p).unwrap()); let config = DedupeConfig::default(); let partitioned = partition(group, &config, &StdLog::new()).unwrap(); assert_eq!(partitioned.to_keep.len(), 1); assert_eq!(partitioned.to_drop.len(), 2); }) } #[test] fn test_partition_bails_out_if_file_modified_too_late() { with_dir("dedupe/partition/modification", |root| { let group = make_group(root, FileHash::from_str("00").unwrap()); let group = group.map(|p| PathAndMetadata::new(p).unwrap()); let config = DedupeConfig { modified_before: Some(DateTime::from(Local::now() - Duration::days(1))), ..DedupeConfig::default() }; let partitioned = partition(group, &config, &StdLog::new()); assert!(partitioned.is_err()); }) } #[test] fn test_partition_skips_file_with_different_len() { with_dir("dedupe/partition/file_len", |root| { let group = make_group(root, FileHash::from_str("00").unwrap()); let path = group.files[0].clone(); write_file(&path.to_path_buf(), "foo"); let config = DedupeConfig { priority: vec![Priority::MostRecentlyModified], ..DedupeConfig::default() }; let group = group.map(|p| PathAndMetadata::new(p).unwrap()); let partitioned = partition(group, &config, &StdLog::new()).unwrap(); assert!(!partitioned.to_drop.iter().any(|m| m.path == path)); assert!(!partitioned.to_keep.iter().any(|m| m.path == path)); }) } fn path_set(v: &[PathAndMetadata]) -> HashSet<&Path> { v.iter().map(|f| &f.path).collect() } #[test] fn test_partition_respects_creation_time_priority() { with_dir("dedupe/partition/ctime_priority", |root| { if fs::metadata(root).unwrap().created().is_err() { // can't run the test because the filesystem doesn't support fetching // file creation time return; } let group = make_group(root, FileHash::from_str("00").unwrap()); let mut config = DedupeConfig { priority: vec![Priority::Newest], ..DedupeConfig::default() }; let group = group.map(|p| PathAndMetadata::new(p).unwrap()); let partitioned_1 = partition(group.clone(), &config, &StdLog::new()).unwrap(); 
config.priority = vec![Priority::Oldest]; let partitioned_2 = partition(group, &config, &StdLog::new()).unwrap(); assert_ne!( path_set(&partitioned_1.to_keep), path_set(&partitioned_2.to_keep) ); assert_ne!( path_set(&partitioned_1.to_drop), path_set(&partitioned_2.to_drop) ); }); } #[test] fn test_partition_respects_modification_time_priority() { with_dir("dedupe/partition/mtime_priority", |root| { let group = make_group(root, FileHash::from_str("00").unwrap()); thread::sleep(time::Duration::from_millis(10)); let path = group.files[0].clone(); write_file(&path.to_path_buf(), "foo"); // note that fetching metadata happens after we wrote the file let group = group.map(|p| PathAndMetadata::new(p).unwrap()); let config = DedupeConfig { priority: vec![Priority::MostRecentlyModified], ..DedupeConfig::default() }; let partitioned_1 = partition(group.clone(), &config, &StdLog::new()).unwrap(); let config = DedupeConfig { priority: vec![Priority::LeastRecentlyModified], ..DedupeConfig::default() }; let partitioned_2 = partition(group, &config, &StdLog::new()).unwrap(); assert_ne!( path_set(&partitioned_1.to_keep), path_set(&partitioned_2.to_keep) ); assert_ne!( path_set(&partitioned_1.to_drop), path_set(&partitioned_2.to_drop) ); }); } #[test] fn test_partition_respects_keep_patterns() { with_dir("dedupe/partition/keep", |root| { let group = make_group(root, FileHash::from_str("00").unwrap()); let group = group.map(|p| PathAndMetadata::new(p).unwrap()); let mut config = DedupeConfig { priority: vec![Priority::LeastRecentlyModified], keep_name_patterns: vec![Pattern::glob("*_1").unwrap()], ..DedupeConfig::default() }; let p = partition(group.clone(), &config, &StdLog::new()).unwrap(); assert_eq!(p.to_keep.len(), 1); assert_eq!(&p.to_keep[0].path, &group.files[0].path); config.keep_name_patterns = vec![]; config.keep_path_patterns = vec![Pattern::glob("**/file_1").unwrap()]; let p = partition(group.clone(), &config, &StdLog::new()).unwrap(); assert_eq!(p.to_keep.len(), 1); assert_eq!(&p.to_keep[0].path, &group.files[0].path); }) } #[test] fn test_partition_respects_drop_patterns() { with_dir("dedupe/partition/drop", |root| { let group = make_group(root, FileHash::from_str("00").unwrap()); let group = group.map(|p| PathAndMetadata::new(p).unwrap()); let mut config = DedupeConfig { priority: vec![Priority::LeastRecentlyModified], name_patterns: vec![Pattern::glob("*_3").unwrap()], ..DedupeConfig::default() }; let p = partition(group.clone(), &config, &StdLog::new()).unwrap(); assert_eq!(p.to_drop.len(), 1); assert_eq!(&p.to_drop[0].path, &group.files[2].path); config.name_patterns = vec![]; config.path_patterns = vec![Pattern::glob("**/file_3").unwrap()]; let p = partition(group.clone(), &config, &StdLog::new()).unwrap(); assert_eq!(p.to_drop.len(), 1); assert_eq!(&p.to_drop[0].path, &group.files[2].path); }) } #[test] fn test_partition_respects_isolated_roots() { with_dir("dedupe/partition/isolated_roots", |root| { let root1 = root.join("root1"); let root2 = root.join("root2"); create_dir(&root1).unwrap(); create_dir(&root2).unwrap(); let group1 = make_group(&root1, FileHash::from_str("00").unwrap()); let group2 = make_group(&root2, FileHash::from_str("00").unwrap()); let group = FileGroup { file_len: group1.file_len, file_hash: group1.file_hash, files: group1.files.into_iter().chain(group2.files).collect(), }; let config = DedupeConfig { isolated_roots: vec![Path::from(&root1), Path::from(&root2)], ..DedupeConfig::default() }; let group = group.map(|p| PathAndMetadata::new(p).unwrap()); let p = 
partition(group, &config, &StdLog::new()).unwrap(); assert_eq!(p.to_drop.len(), 3); assert!(p .to_drop .iter() .all(|f| f.path.to_path_buf().starts_with(&root2))); assert_eq!(p.to_keep.len(), 3); assert!(p .to_keep .iter() .all(|f| f.path.to_path_buf().starts_with(&root1))); }) } #[test] fn test_partition_respects_links() { with_dir("dedupe/partition/links", |root| { let root_a = root.join("root_a"); let root_b = root.join("root_b"); create_dir(&root_a).unwrap(); create_dir(&root_b).unwrap(); let file_a1 = root_a.join("file_a1"); let file_a2 = root_a.join("file_a2"); write_file(&file_a1, "aaa"); fs::hard_link(&file_a1, &file_a2).unwrap(); let file_b1 = root_b.join("file_b1"); let file_b2 = root_b.join("file_b2"); write_file(&file_b1, "aaa"); fs::hard_link(&file_b1, &file_b2).unwrap(); let group = FileGroup { file_len: FileLen(3), file_hash: FileHash::from_str("00").unwrap(), files: vec![ Path::from(&file_b1), Path::from(&file_a2), Path::from(&file_a1), Path::from(&file_b2), ], }; let config = DedupeConfig::default(); let group = group.map(|p| PathAndMetadata::new(p).unwrap()); let p = partition(group, &config, &StdLog::new()).unwrap(); // drop A files because file_a2 appears after file_b1 in the files vector assert_eq!(p.to_drop.len(), 2); assert!(p .to_drop .iter() .all(|f| f.path.to_path_buf().starts_with(&root_a))); assert_eq!(p.to_keep.len(), 2); assert!(p .to_keep .iter() .all(|f| f.path.to_path_buf().starts_with(&root_b))); }) } #[test] fn test_run_dedupe_script() { with_dir("dedupe/partition/run_dedupe_script", |root| { let mut log = StdLog::new(); log.no_progress = true; log.log_stderr_to_stdout = true; let group = make_group(root, FileHash::from_str("00").unwrap()); let config = DedupeConfig { priority: vec![Priority::LeastRecentlyModified], ..DedupeConfig::default() }; let script = dedupe(vec![group], DedupeOp::Remove, &config, &log); let dedupe_result = run_script(script, !config.no_lock, &log); assert_eq!(dedupe_result.processed_count, 2); assert!(!root.join("file_1").exists()); assert!(!root.join("file_2").exists()); assert!(root.join("file_3").exists()); }); } #[test] fn test_log_dedupe_script() { with_dir("dedupe/partition/log_dedupe_script", |root| { let mut log = StdLog::new(); log.no_progress = true; log.log_stderr_to_stdout = true; let group_1 = make_group(&root.join("group_1"), FileHash::from_str("00").unwrap()); let group_2 = make_group(&root.join("group_2"), FileHash::from_str("01").unwrap()); let group_3 = make_group(&root.join("group_3"), FileHash::from_str("02").unwrap()); let groups = vec![group_1, group_2, group_3]; let config = DedupeConfig { priority: vec![Priority::LeastRecentlyModified], ..DedupeConfig::default() }; let script = dedupe(groups, DedupeOp::Remove, &config, &log); let mut out = Vec::new(); let dedupe_result = log_script(script, &mut out).unwrap(); assert_eq!(dedupe_result.processed_count, 6); let out = String::from_utf8(out).unwrap(); let out_lines = out.lines().collect_vec(); assert_eq!(out_lines.len(), 6); assert!(out_lines[0].contains("group_1")); assert!(out_lines[1].contains("group_1")); assert!(out_lines[2].contains("group_2")); assert!(out_lines[3].contains("group_2")); assert!(out_lines[4].contains("group_3")); assert!(out_lines[5].contains("group_3")); }); } #[test] fn test_hard_link_merges_subgroups_of_hard_links() { with_dir("dedupe/merge_subgroups_of_hardlinks", |root| { let mut log = StdLog::new(); log.no_progress = true; log.log_stderr_to_stdout = true; let file_a1 = root.join("file_a1"); let file_a2 = root.join("file_a2"); let 
file_b1 = root.join("file_b1"); let file_b2 = root.join("file_b2"); write_file(&file_a1, "foo"); write_file(&file_b1, "foo"); let file_id = FileId::new(&Path::from(&file_a1)).unwrap(); fs::hard_link(&file_a1, &file_a2).unwrap(); fs::hard_link(&file_b1, &file_b2).unwrap(); let group_config = GroupConfig { paths: vec![Path::from(root)], ..GroupConfig::default() }; let groups = group_files(&group_config, &log).unwrap(); let dedupe_config = DedupeConfig::default(); let script = dedupe(groups, DedupeOp::HardLink, &dedupe_config, &log); let dedupe_result = run_script(script, false, &log); assert_eq!(dedupe_result.processed_count, 2); assert!(file_a1.exists()); assert!(file_a2.exists()); assert!(file_b1.exists()); assert!(file_b2.exists()); assert_eq!(read_file(&file_a1), "foo"); assert_eq!(FileId::new(&Path::from(&file_a2)).unwrap(), file_id); assert_eq!(FileId::new(&Path::from(&file_b1)).unwrap(), file_id); assert_eq!(FileId::new(&Path::from(&file_b2)).unwrap(), file_id); }) } #[test] #[cfg(unix)] fn test_remove_removes_subgroups_of_soft_links() { use std::os::unix::fs; with_dir("dedupe/remove_subgroups_with_symlinks", |root| { let mut log = StdLog::new(); log.no_progress = true; log.log_stderr_to_stdout = true; let file_a1 = root.join("file_a1"); let file_a2 = root.join("file_a2"); let file_b1 = root.join("file_b1"); let file_b2 = root.join("file_b2"); write_file(&file_a1, "foo"); write_file(&file_b1, "foo"); fs::symlink(&file_a1, &file_a2).unwrap(); fs::symlink(&file_b1, &file_b2).unwrap(); let group_config = GroupConfig { paths: vec![Path::from(root)], symbolic_links: true, ..GroupConfig::default() }; let groups = group_files(&group_config, &log).unwrap(); let dedupe_config = DedupeConfig::default(); let script = dedupe(groups, DedupeOp::Remove, &dedupe_config, &log); let dedupe_result = run_script(script, false, &log); assert_eq!(dedupe_result.processed_count, 2); assert!(file_a1.exists()); assert!(file_a2.exists()); assert!(!file_b1.exists()); assert!(!file_b2.exists()); assert_eq!(read_file(&file_a1), "foo"); assert_eq!(read_file(&file_a2), "foo"); }) } } 0707010000000F000081A4000000000000000000000001653E86C200002FA3000000000000000000000000000000000000002500000000fclones-0.34.0/fclones/src/device.rsuse core::cmp; use std::collections::HashMap; use std::ffi::{OsStr, OsString}; use std::ops::Index; use itertools::Itertools; use lazy_init::Lazy; use rayon::{ThreadPool, ThreadPoolBuilder}; use sysinfo::{DiskExt, DiskKind, System, SystemExt}; use crate::config::Parallelism; use crate::file::FileLen; use crate::path::Path; impl Parallelism { pub fn default_for(disk_kind: DiskKind) -> Parallelism { let cpu_count = num_cpus::get(); match disk_kind { // SSDs typically benefit from a lot of parallelism. // Some users will probably want to increase it even more. DiskKind::SSD => Parallelism { random: 4 * cpu_count, sequential: 4 * cpu_count, }, // Rotational drives can't serve multiple requests at once. // After introducing access ordering in fclones 0.9.0 it turns out // that we get slightly more IOPS when we schedule random access operations on a // single thread. For sequential scanning of big files, parallel access can hurt a lot, // so 1 is the only possible choice here. DiskKind::HDD => Parallelism { random: 1, sequential: 1, }, // Unknown device here, so we need to stay away from potentially extremely bad defaults. // If the underlying device is an SSD, a single-threaded mode can // degrade random I/O performance many times. 
On the other hand making parallel random // access to a HDD didn't degrade performance by more than 30% in our tests, // and sometimes it can speed things up. // For sequential reads of big files we obviously stay single threaded, // as multithreading can hurt really a lot in case the underlying device is rotational. _ => Parallelism { random: 4 * cpu_count, sequential: 1, }, } } } pub struct DiskDevice { pub index: usize, pub name: OsString, pub disk_kind: DiskKind, pub file_system: String, pub parallelism: Parallelism, seq_thread_pool: Lazy<ThreadPool>, rand_thread_pool: Lazy<ThreadPool>, } impl DiskDevice { fn new( index: usize, name: OsString, disk_kind: DiskKind, file_system: String, parallelism: Parallelism, ) -> DiskDevice { DiskDevice { index, name, disk_kind, file_system, parallelism, seq_thread_pool: Lazy::new(), rand_thread_pool: Lazy::new(), } } fn build_thread_pool(num_threads: usize) -> ThreadPool { ThreadPoolBuilder::default() .num_threads(num_threads) .build() .unwrap() } pub fn seq_thread_pool(&self) -> &ThreadPool { self.seq_thread_pool .get_or_create(|| Self::build_thread_pool(self.parallelism.sequential)) } pub fn rand_thread_pool(&self) -> &ThreadPool { self.rand_thread_pool .get_or_create(|| Self::build_thread_pool(self.parallelism.random)) } pub fn min_prefix_len(&self) -> FileLen { FileLen(match self.disk_kind { DiskKind::SSD => 4 * 1024, DiskKind::HDD => 4 * 1024, DiskKind::Unknown(_) => 4 * 1024, }) } pub fn max_prefix_len(&self) -> FileLen { FileLen(match self.disk_kind { DiskKind::SSD => 4 * 1024, DiskKind::HDD => 16 * 1024, DiskKind::Unknown(_) => 16 * 1024, }) } pub fn suffix_len(&self) -> FileLen { self.max_prefix_len() } pub fn suffix_threshold(&self) -> FileLen { FileLen(match self.disk_kind { DiskKind::HDD => 64 * 1024 * 1024, // 64 MB DiskKind::SSD => 64 * 1024, // 64 kB DiskKind::Unknown(_) => 64 * 1024 * 1024, }) } } /// Finds disk devices by file paths pub struct DiskDevices { devices: Vec<DiskDevice>, mount_points: Vec<(Path, usize)>, } impl DiskDevices { #[cfg(test)] pub fn single(disk_kind: DiskKind, parallelism: usize) -> DiskDevices { let device = DiskDevice::new( 0, OsString::from("/"), disk_kind, String::from("unknown"), Parallelism { random: parallelism, sequential: parallelism, }, ); DiskDevices { devices: vec![device], mount_points: vec![(Path::from("/"), 0)], } } /// Reads the preferred parallelism level for the device based on the /// device name or the device type (ssd/hdd) from `pool_sizes` map. /// Returns the value under the "default" key if device was not found, /// or 0 if "default" doesn't exist in the map. /// If found, the device key is removed from the map. fn get_parallelism( name: &OsStr, disk_kind: DiskKind, pool_sizes: &HashMap<OsString, Parallelism>, ) -> Parallelism { let mut dev_key = OsString::new(); dev_key.push("dev:"); dev_key.push(name); match pool_sizes.get(&dev_key) { Some(p) => *p, None => { let p = match disk_kind { DiskKind::SSD => pool_sizes.get(OsStr::new("ssd")), DiskKind::HDD => pool_sizes.get(OsStr::new("hdd")), DiskKind::Unknown(_) => pool_sizes.get(OsStr::new("unknown")), }; match p { Some(p) => *p, None => pool_sizes .get(OsStr::new("default")) .copied() .unwrap_or_else(|| Parallelism::default_for(disk_kind)), } } } } /// If the device doesn't exist, adds a new device to devices vector and returns its index. /// If the device already exists, it returns the index of the existing device. 
fn add_device( &mut self, name: OsString, disk_kind: DiskKind, file_system: String, pool_sizes: &HashMap<OsString, Parallelism>, ) -> usize { if let Some((index, _)) = self.devices.iter().find_position(|d| d.name == name) { index } else { let index = self.devices.len(); let parallelism = Self::get_parallelism(&name, disk_kind, pool_sizes); self.devices.push(DiskDevice::new( index, name, disk_kind, file_system, parallelism, )); index } } /// If `name` is a disk partition, it attempts to return the disk device name the partition /// resides on. Otherwise, and on failures, it just returns the same `name`. #[cfg(target_os = "linux")] fn physical_device_name(name: &OsStr) -> OsString { let regex = regex::Regex::new(r"^/dev/([fhs]d[a-z]|nvme[0-9]+).*").unwrap(); let name_str = name.to_string_lossy(); match regex.captures(name_str.as_ref()) { Some(captures) => { let parent = "/dev/".to_owned() + captures.get(1).unwrap().as_str(); OsString::from(parent) } None => name.to_os_string(), } } #[cfg(not(target_os = "linux"))] fn physical_device_name(name: &OsStr) -> OsString { name.to_os_string() } /// Reads the list of partitions and disks from the system and builds the `DiskDevices` /// structure from that information. pub fn new(pool_sizes: &HashMap<OsString, Parallelism>) -> DiskDevices { let mut sys = System::new(); sys.refresh_disks_list(); let mut result = DiskDevices { devices: Vec::new(), mount_points: Vec::new(), }; // Default device used when we don't find any real device result.add_device( OsString::from("default"), DiskKind::Unknown(-1), String::from("unknown"), pool_sizes, ); for d in sys.disks() { let device_name = Self::physical_device_name(d.name()); let index = result.add_device( device_name, d.kind(), String::from_utf8_lossy(d.file_system()).to_string(), pool_sizes, ); // On macOS APFS disk users' data is mounted in '/System/Volumes/Data' // but fused transparently and presented as part of the root filesystem. // It requires remapping Data volume path for this DiskDevice to '/'. 
// https://www.swiftforensics.com/2019/10/macos-1015-volumes-firmlink-magic.html // https://eclecticlight.co/2020/01/23/catalina-boot-volumes/ if cfg!(target_os = "macos") && d.file_system() == b"apfs" && d.mount_point().to_str() == Some("/System/Volumes/Data") { result.mount_points.push((Path::from("/"), index)); } else { result .mount_points .push((Path::from(d.mount_point()), index)); }; } result .mount_points .sort_by_key(|(p, _)| cmp::Reverse(p.component_count())); result } /// Returns the mount point holding given path pub fn get_mount_point(&self, path: &Path) -> &Path { self.mount_points .iter() .map(|(p, _)| p) .find(|p| p.is_prefix_of(path)) .unwrap_or(&self.mount_points[0].0) } /// Returns the disk device which holds the given path pub fn get_by_path(&self, path: &Path) -> &DiskDevice { self.mount_points .iter() .find(|(p, _)| p.is_prefix_of(path)) .map(|&(_, index)| &self.devices[index]) .unwrap_or(&self.devices[0]) } /// Returns the disk device by its device name (not mount point) pub fn get_by_name(&self, name: &OsStr) -> Option<&DiskDevice> { self.devices.iter().find(|&d| d.name == name) } /// Returns the first device on the list pub fn get_default(&self) -> &DiskDevice { &self.devices[0] } /// Returns the number of devices pub fn len(&self) -> usize { self.devices.len() } /// Always returns false, because the default device is guaranteed to exist pub fn is_empty(&self) -> bool { assert!(self.devices.is_empty()); false } /// Returns an iterator over devices pub fn iter(&self) -> impl Iterator<Item = &DiskDevice> { self.devices.iter() } /// Returns device_group identifiers recognized by the constructor pub fn device_types() -> Vec<&'static str> { vec!["ssd", "hdd", "removable", "unknown"] } } impl Default for DiskDevices { fn default() -> Self { let pool_sizes = HashMap::new(); Self::new(&pool_sizes) } } impl Index<usize> for DiskDevices { type Output = DiskDevice; fn index(&self, index: usize) -> &Self::Output { &self.devices[index] } } #[cfg(test)] mod test { use super::*; #[test] #[cfg_attr(not(target_os = "linux"), ignore)] fn test_physical_device_name() { assert_eq!( DiskDevices::physical_device_name(OsStr::new("/dev/sda")), OsString::from("/dev/sda") ); assert_eq!( DiskDevices::physical_device_name(OsStr::new("/dev/sda1")), OsString::from("/dev/sda") ); assert_eq!( DiskDevices::physical_device_name(OsStr::new("/dev/hdc20")), OsString::from("/dev/hdc") ); assert_eq!( DiskDevices::physical_device_name(OsStr::new("/dev/nvme0n1p3")), OsString::from("/dev/nvme0") ); assert_eq!( DiskDevices::physical_device_name(OsStr::new("/dev/unknown")), OsString::from("/dev/unknown") ); } } 07070100000010000081A4000000000000000000000001653E86C2000003E0000000000000000000000000000000000000002400000000fclones-0.34.0/fclones/src/error.rsuse std::fmt::{Display, Formatter}; use std::{fmt, io}; /// Error reported by top-level fclones functions #[derive(Debug)] pub struct Error { pub message: String, } impl Error { pub fn new(msg: String) -> Error { Error { message: msg } } } impl std::error::Error for Error {} impl Display for Error { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { write!(f, "{}", self.message) } } impl From<String> for Error { fn from(s: String) -> Self { Error::new(s) } } impl From<&str> for Error { fn from(s: &str) -> Self { Error::new(s.to_owned()) } } /// Returns error kind. /// Maps `libc::ENOTSUP` and `libc::EOPNOTSUPP` errors to `ErrorKind::Unsupported` on Unix. 
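// Editor's note: the test module below is an illustrative sketch added to this
// listing; it is not part of the upstream fclones sources. It merely exercises
// the `error_kind` helper defined right after it, assuming the `libc` crate
// that this file already depends on. The module and test names are made up.
#[cfg(all(test, unix))]
mod error_kind_example {
    use std::io;

    #[test]
    fn maps_enotsup_to_unsupported() {
        // A raw ENOTSUP OS error should surface as ErrorKind::Unsupported,
        // so callers can branch on the kind instead of comparing raw OS codes.
        let err = io::Error::from_raw_os_error(libc::ENOTSUP);
        assert_eq!(super::error_kind(&err), io::ErrorKind::Unsupported);
    }
}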
pub fn error_kind(error: &io::Error) -> io::ErrorKind { #[cfg(unix)] #[allow(unreachable_patterns)] if let Some(libc::ENOTSUP | libc::EOPNOTSUPP) = error.raw_os_error() { return io::ErrorKind::Unsupported; } error.kind() } 07070100000011000081A4000000000000000000000001653E86C200003B9B000000000000000000000000000000000000002300000000fclones-0.34.0/fclones/src/file.rs//! Type-safe wrappers for file position and length and other //! file-system related utilities. use core::fmt; use std::fmt::Display; use std::hash::Hash; use std::io::{ErrorKind, SeekFrom}; use std::iter::Sum; use std::ops::{Add, AddAssign, BitXor, Deref, Mul, Sub, SubAssign}; use std::{fs, io}; use byte_unit::Byte; use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; use bytesize::ByteSize; use hex::FromHexError; use itertools::{EitherOrBoth, Itertools}; use rayon::iter::{IntoParallelRefIterator, IntoParallelRefMutIterator, ParallelIterator}; use serde::*; use smallvec::alloc::fmt::Formatter; use smallvec::alloc::str::FromStr; use crate::device::DiskDevices; use crate::group::FileGroup; use crate::log::{Log, LogExt}; use crate::path::Path; /// Represents data position in the file, counted from the beginning of the file, in bytes. /// Provides more type safety and nicer formatting over using a raw u64. /// Offsets are formatted as hex. #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug, Serialize, Deserialize)] pub struct FilePos(pub u64); impl FilePos { pub fn zero() -> FilePos { FilePos(0) } } impl Display for FilePos { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { write!(f, "{}", self.0) } } impl From<u64> for FilePos { fn from(pos: u64) -> Self { FilePos(pos) } } impl From<usize> for FilePos { fn from(pos: usize) -> Self { FilePos(pos as u64) } } impl From<FilePos> for u64 { fn from(pos: FilePos) -> Self { pos.0 } } impl From<FilePos> for usize { fn from(pos: FilePos) -> Self { pos.0 as usize } } impl From<FilePos> for SeekFrom { fn from(pos: FilePos) -> Self { SeekFrom::Start(pos.0) } } impl Add<FileLen> for FilePos { type Output = FilePos; fn add(self, rhs: FileLen) -> Self::Output { FilePos(self.0 + rhs.0) } } impl Sub<FileLen> for FilePos { type Output = FilePos; fn sub(self, rhs: FileLen) -> Self::Output { FilePos(self.0 - rhs.0) } } /// Represents length of data, in bytes. /// Provides more type safety and nicer formatting over using a raw u64. 
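// Editor's note: an illustrative sketch, not part of the upstream sources.
// It shows the intended round trip between human-readable size strings and the
// `FileLen` wrapper declared directly below, assuming the `byte_unit` (parsing)
// and `bytesize` (formatting) crates this module already uses. The module and
// test names are made up.
#[cfg(test)]
mod file_len_example {
    use super::FileLen;
    use std::str::FromStr;

    #[test]
    fn parses_and_formats_human_readable_sizes() {
        // "16 KB" is interpreted with decimal units, i.e. 16_000 bytes...
        let len = FileLen::from_str("16 KB").expect("valid size literal");
        assert_eq!(len, FileLen(16_000));
        // ...and Display renders it back in the same decimal convention.
        assert_eq!(len.to_string(), "16.0 KB");
    }
}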
#[derive( Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug, Deserialize, Serialize, Default, )] pub struct FileLen(pub u64); impl FileLen { pub const MAX: FileLen = FileLen(u64::MAX); pub fn as_pos(self) -> FilePos { FilePos(self.0) } } impl From<u64> for FileLen { fn from(l: u64) -> Self { FileLen(l) } } impl From<usize> for FileLen { fn from(l: usize) -> Self { FileLen(l as u64) } } impl From<FileLen> for u64 { fn from(l: FileLen) -> Self { l.0 } } impl From<FileLen> for usize { fn from(l: FileLen) -> Self { l.0 as usize } } impl Add for FileLen { type Output = FileLen; fn add(self, rhs: Self) -> Self::Output { FileLen(self.0 + rhs.0) } } impl AddAssign for FileLen { fn add_assign(&mut self, rhs: Self) { self.0 += rhs.0 } } impl Sub for FileLen { type Output = FileLen; fn sub(self, rhs: Self) -> Self::Output { FileLen(self.0 - rhs.0) } } impl SubAssign for FileLen { fn sub_assign(&mut self, rhs: Self) { self.0 -= rhs.0 } } impl Mul<u64> for FileLen { type Output = FileLen; fn mul(self, rhs: u64) -> Self::Output { FileLen(self.0 * rhs) } } impl Sum<FileLen> for FileLen { fn sum<I: Iterator<Item = FileLen>>(iter: I) -> Self { iter.fold(FileLen(0), |a, b| a + b) } } impl FromStr for FileLen { type Err = byte_unit::ByteError; fn from_str(s: &str) -> Result<Self, Self::Err> { let b = Byte::from_str(s)?; Ok(FileLen(b.get_bytes() as u64)) } } impl Display for FileLen { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { f.pad(format!("{}", ByteSize(self.0)).as_str()) } } /// A file chunk to be hashed pub struct FileChunk<'a> { pub path: &'a Path, pub pos: FilePos, pub len: FileLen, } impl FileChunk<'_> { pub fn new(path: &Path, pos: FilePos, len: FileLen) -> FileChunk<'_> { FileChunk { path, pos, len } } } #[cfg(unix)] pub type InodeId = u64; #[cfg(windows)] pub type InodeId = u128; /// Useful for identifying files in presence of hardlinks #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize)] pub struct FileId { pub device: u64, pub inode: InodeId, } impl FileId { #[cfg(unix)] pub fn new(file: &Path) -> io::Result<FileId> { use std::os::unix::fs::MetadataExt; match fs::metadata(file.to_path_buf()) { Ok(metadata) => Ok(FileId { inode: metadata.ino(), device: metadata.dev(), }), Err(e) => Err(io::Error::new( e.kind(), format!("Failed to read metadata of {}: {}", file.display(), e), )), } } #[cfg(windows)] pub fn new(file: &Path) -> io::Result<FileId> { Self::from_file(&fs::File::open(file.to_path_buf())?).map_err(|_| { io::Error::new( ErrorKind::Other, format!( "Failed to read file identifier of {}: {}", file.display(), io::Error::last_os_error() ), ) }) } #[cfg(windows)] pub fn from_file(file: &fs::File) -> io::Result<FileId> { use std::os::windows::io::*; use winapi::ctypes::c_void; use winapi::um::fileapi::{ GetFileInformationByHandle, BY_HANDLE_FILE_INFORMATION, FILE_ID_INFO, }; use winapi::um::minwinbase::FileIdInfo; use winapi::um::winbase::GetFileInformationByHandleEx; let handle = file.as_raw_handle(); unsafe { let mut file_id: FILE_ID_INFO = std::mem::zeroed(); let file_id_ptr = (&mut file_id) as *mut _ as *mut c_void; const FILE_ID_SIZE: u32 = std::mem::size_of::<FILE_ID_INFO>() as u32; if GetFileInformationByHandleEx(handle, FileIdInfo, file_id_ptr, FILE_ID_SIZE) != 0 { return Ok(FileId { device: file_id.VolumeSerialNumber as u64, inode: u128::from_be_bytes(file_id.FileId.Identifier), }); } let mut file_info: BY_HANDLE_FILE_INFORMATION = std::mem::zeroed(); let file_info_ptr = (&mut file_info) as *mut _; if 
GetFileInformationByHandle(handle, file_info_ptr) != 0 { return Ok(FileId { device: file_info.dwVolumeSerialNumber as u64, inode: ((file_info.nFileIndexHigh as u128) << 32) | file_info.nFileIndexLow as u128, }); } Err(io::Error::new( ErrorKind::Other, format!( "Failed to read file identifier: {}", io::Error::last_os_error() ), )) } } #[cfg(unix)] pub fn from_metadata(metadata: &fs::Metadata) -> FileId { use std::os::unix::fs::MetadataExt; FileId { inode: metadata.ino(), device: metadata.dev(), } } pub fn of(f: impl AsRef<FileId>) -> FileId { *f.as_ref() } } /// Convenience wrapper for accessing OS-dependent metadata like inode and device-id #[derive(Debug, Clone)] pub struct FileMetadata { id: FileId, metadata: fs::Metadata, } impl FileMetadata { pub fn new(path: &Path) -> io::Result<FileMetadata> { let path_buf = path.to_path_buf(); let metadata = fs::metadata(path_buf).map_err(|e| { io::Error::new( e.kind(), format!("Failed to read metadata of {}: {}", path.display(), e), ) })?; #[cfg(unix)] let id = FileId::from_metadata(&metadata); #[cfg(windows)] let id = FileId::new(&path)?; Ok(FileMetadata { id, metadata }) } pub fn len(&self) -> FileLen { FileLen(self.metadata.len()) } pub fn file_id(&self) -> FileId { self.id } pub fn device_id(&self) -> u64 { self.id.device } pub fn inode_id(&self) -> InodeId { self.id.inode } } impl Deref for FileMetadata { type Target = fs::Metadata; fn deref(&self) -> &Self::Target { &self.metadata } } impl AsRef<FileId> for FileMetadata { fn as_ref(&self) -> &FileId { &self.id } } #[derive(Clone, Debug, Eq, PartialEq)] pub struct FileInfo { pub path: Path, pub id: FileId, pub len: FileLen, // physical on-disk location of file data for access ordering optimisation // the highest 16 bits encode the device id pub(crate) location: u64, } const OFFSET_MASK: u64 = 0x0000FFFFFFFFFFFF; #[cfg(target_os = "linux")] const DEVICE_MASK: u64 = 0xFFFF000000000000; impl FileInfo { fn new(path: Path, devices: &DiskDevices) -> io::Result<FileInfo> { let device_index = devices.get_by_path(&path).index as u64; let metadata = FileMetadata::new(&path)?; let file_len = metadata.len(); let id = metadata.id; let inode_id = metadata.inode_id(); #[allow(clippy::unnecessary_cast)] // cast needed when inode_id > 64 bits on some platforms Ok(FileInfo { path, id, len: file_len, location: device_index << 48 | (inode_id as u64) & OFFSET_MASK, }) } /// Returns the device index into the `DiskDevices` instance passed at creation pub fn get_device_index(&self) -> usize { (self.location >> 48) as usize } #[cfg(target_os = "linux")] pub fn fetch_physical_location(&mut self) -> io::Result<u64> { let new_location = get_physical_file_location(self.as_ref())?; if let Some(new_location) = new_location { self.location = self.location & DEVICE_MASK | (new_location >> 8) & OFFSET_MASK; } Ok(self.location) } } impl AsRef<FileId> for FileInfo { fn as_ref(&self) -> &FileId { &self.id } } impl AsRef<Path> for FileInfo { fn as_ref(&self) -> &Path { &self.path } } impl From<FileInfo> for Path { fn from(info: FileInfo) -> Self { info.path } } /// Returns file information for the given path. /// On failure, logs an error to stderr and returns `None`. 
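// Editor's note: an illustrative sketch, not part of the upstream sources.
// It demonstrates the `location` bit packing used by `FileInfo` above: the top
// 16 bits carry the device index, the low 48 bits carry the inode or physical
// offset. The module, test and variable names are made up for the example.
#[cfg(test)]
mod location_packing_example {
    use super::{FileId, FileInfo, FileLen, OFFSET_MASK};
    use crate::path::Path;

    #[test]
    fn device_index_lives_in_the_high_bits() {
        let device_index: u64 = 3;
        let physical_offset: u64 = 0x1234_5678_9ABC; // fits in the low 48 bits
        let info = FileInfo {
            path: Path::from("example"),
            id: FileId { device: 1, inode: 42 },
            len: FileLen(0),
            location: device_index << 48 | physical_offset & OFFSET_MASK,
        };
        assert_eq!(info.get_device_index(), 3);
    }
}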
pub(crate) fn file_info_or_log_err( file: Path, devices: &DiskDevices, log: &dyn Log, ) -> Option<FileInfo> { match FileInfo::new(file, devices) { Ok(info) => Some(info), Err(e) if e.kind() == ErrorKind::NotFound => None, Err(e) => { log.warn(e); None } } } /// Returns the physical offset of the first data block of the file #[cfg(target_os = "linux")] pub(crate) fn get_physical_file_location(path: &Path) -> io::Result<Option<u64>> { use crate::rlimit::RLIMIT_OPEN_FILES; let mut extents = { let _open_files_guard = RLIMIT_OPEN_FILES.clone().access_owned(); fiemap::fiemap(path.to_path_buf())? }; match extents.next() { Some(fe) => Ok(Some(fe?.fe_physical)), None => Ok(None), } } #[derive(Clone, Default, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)] pub struct FileHash(Box<[u8]>); impl FileHash { pub fn u128_prefix(&self) -> u128 { self.0 .as_ref() .read_u128::<LittleEndian>() .expect("Hash must be at least 128-bit long") } } pub trait AsFileHash { fn as_file_hash(&self) -> &FileHash; } impl AsFileHash for FileHash { fn as_file_hash(&self) -> &FileHash { self } } impl<T> AsFileHash for (T, FileHash) { fn as_file_hash(&self) -> &FileHash { &self.1 } } impl Display for FileHash { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { f.pad(hex::encode(&self.0).as_str()) } } impl From<&[u8]> for FileHash { fn from(bytes: &[u8]) -> Self { FileHash(bytes.into()) } } impl From<u128> for FileHash { fn from(hash: u128) -> Self { let mut bytes: Vec<u8> = vec![]; bytes.write_u128::<LittleEndian>(hash).unwrap(); FileHash(bytes.into_boxed_slice()) } } impl FromStr for FileHash { type Err = FromHexError; fn from_str(s: &str) -> Result<Self, Self::Err> { Ok(FileHash(hex::decode(s)?.into_boxed_slice())) } } impl BitXor for FileHash { type Output = Self; fn bitxor(self, rhs: Self) -> Self::Output { FileHash( self.0 .iter() .zip_longest(rhs.0.as_ref()) .map(|r| match r { EitherOrBoth::Both(a, b) => a ^ b, _ => 0, }) .collect_vec() .into_boxed_slice(), ) } } impl Serialize for FileHash { fn serialize<S>(&self, serializer: S) -> Result<<S as Serializer>::Ok, <S as Serializer>::Error> where S: Serializer, { serializer.collect_str(self.to_string().as_str()) } } impl<'de> Deserialize<'de> for FileHash { fn deserialize<D>(deserializer: D) -> Result<FileHash, D::Error> where D: Deserializer<'de>, { let s = String::deserialize(deserializer)?; let h = FileHash::from_str(&s).map_err(de::Error::custom)?; Ok(h) } } /// Makes it possible to operate generically on collections of files, regardless /// of the way how the collection is implemented. We sometimes need to work on grouped files /// but sometimes we just have a flat vector. 
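// Editor's note: an illustrative sketch, not part of the upstream sources.
// It shows the hex round trip of `FileHash` (Display / FromStr) and the
// length-preserving XOR used when combining prefix and suffix hashes.
// The module and test names are made up.
#[cfg(test)]
mod file_hash_example {
    use super::FileHash;
    use std::str::FromStr;

    #[test]
    fn hex_round_trip_and_xor() {
        let a = FileHash::from(0x0102_0304_0506_0708_090a_0b0c_0d0e_0f10_u128);
        // Display renders the hash as lowercase hex; FromStr parses it back.
        let b = FileHash::from_str(&a.to_string()).expect("valid hex");
        assert_eq!(a, b);
        // XOR-ing a hash with an equal one yields an all-zero hash of the same length.
        let zero = a.clone() ^ b;
        assert_eq!(zero.to_string(), "0".repeat(32));
    }
}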
pub(crate) trait FileCollection { /// Returns the number of files in the collection fn count(&self) -> usize; /// Returns the total size of files in the collection fn total_size(&self) -> FileLen; /// Performs given action on each file in the collection fn for_each_mut<OP>(&mut self, op: OP) where OP: Fn(&mut FileInfo) + Sync + Send; } impl FileCollection for Vec<FileInfo> { fn count(&self) -> usize { self.len() } fn total_size(&self) -> FileLen { self.par_iter().map(|f| f.len).sum() } fn for_each_mut<OP>(&mut self, op: OP) where OP: Fn(&mut FileInfo) + Sync + Send, { self.par_iter_mut().for_each(op) } } impl FileCollection for Vec<FileGroup<FileInfo>> { fn count(&self) -> usize { self.iter().map(|g| g.file_count()).sum() } fn total_size(&self) -> FileLen { self.par_iter().map(|g| g.total_size()).sum() } fn for_each_mut<OP>(&mut self, op: OP) where OP: Fn(&mut FileInfo) + Sync + Send, { self.par_iter_mut().flat_map(|g| &mut g.files).for_each(op) } } #[derive(Copy, Clone, Debug)] pub(crate) enum FileAccess { Sequential, Random, } #[cfg(test)] mod test { use super::*; #[test] fn test_format_bytes() { let file_len = FileLen(16000); let human_readable = format!("{file_len}"); assert_eq!(human_readable, "16.0 KB"); } } 07070100000012000081A4000000000000000000000001653E86C2000126A8000000000000000000000000000000000000002400000000fclones-0.34.0/fclones/src/group.rs//! Grouping identical files together. use std::cell::RefCell; use std::cmp::{max, min, Reverse}; use std::collections::BTreeMap; use std::collections::HashMap; use std::env::{args_os, current_dir}; use std::ffi::{OsStr, OsString}; use std::fmt::Debug; use std::fs::File; use std::hash::Hash; use std::io; use std::io::BufWriter; use std::iter::FromIterator; use std::marker::PhantomData; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::mpsc::{channel, Receiver, Sender}; use std::sync::Arc; use chrono::{DateTime, Local}; use console::Term; use crossbeam_utils::thread; use indexmap::IndexMap; use itertools::Itertools; use rayon::prelude::*; use serde::*; use smallvec::SmallVec; use thread_local::ThreadLocal; use crate::arg::Arg; use crate::config::*; use crate::device::{DiskDevice, DiskDevices}; use crate::error::Error; use crate::file::*; use crate::hasher::FileHasher; use crate::log::{Log, LogExt, ProgressBarLength}; use crate::path::Path; use crate::phase::{Phase, Phases}; use crate::report::{FileStats, ReportHeader, ReportWriter}; use crate::rlimit::RLIMIT_OPEN_FILES; use crate::selector::PathSelector; use crate::semaphore::Semaphore; use crate::walk::Walk; /// Groups items by key. /// After all items have been added, this structure can be transformed into /// an iterator over groups. /// The order of groups in the output iterator is not defined. /// The order of items in each group matches the order of adding the items by a thread. /// /// Internally uses a hash map. /// The amortized complexity of adding an item is O(1). /// The complexity of reading all groups is O(N). /// struct GroupMap<T, K, V, F> where K: PartialEq + Hash, F: Fn(T) -> (K, V), { item_type: PhantomData<T>, groups: BTreeMap<K, SmallVec<[V; 1]>>, split_fn: F, } impl<T, K, V, F> GroupMap<T, K, V, F> where K: Eq + Hash + Ord, F: Fn(T) -> (K, V), { /// Creates a new empty map. /// /// # Arguments /// * `split_fn` - a function generating the key-value pair for each input item pub fn new(split_fn: F) -> GroupMap<T, K, V, F> { GroupMap { item_type: PhantomData, groups: BTreeMap::new(), split_fn, } } /// Adds an item to the map. 
/// Note, this doesn't take `&mut self` so this can be called from safely from many threads. pub fn add(&mut self, item: T) { let (key, new_item) = (self.split_fn)(item); self.groups.entry(key).or_default().push(new_item); } } impl<T, K, V, F> IntoIterator for GroupMap<T, K, V, F> where K: Eq + Hash, F: Fn(T) -> (K, V), { type Item = (K, SmallVec<[V; 1]>); type IntoIter = <BTreeMap<K, SmallVec<[V; 1]>> as IntoIterator>::IntoIter; fn into_iter(self) -> Self::IntoIter { self.groups.into_iter() } } /// Holds stuff needed globally by the whole application struct GroupCtx<'a> { pub config: &'a GroupConfig, pub log: &'a dyn Log, phases: Phases, group_filter: FileGroupFilter, devices: DiskDevices, path_selector: PathSelector, hasher: FileHasher<'a>, } impl<'a> GroupCtx<'a> { pub fn new(config: &'a GroupConfig, log: &'a dyn Log) -> Result<GroupCtx<'a>, Error> { let phases = if config.transform.is_some() { Phases::new(vec![ Phase::Walk, Phase::FetchExtents, Phase::TransformAndGroup, ]) } else { Phases::new(vec![ Phase::Walk, Phase::GroupBySize, Phase::FetchExtents, Phase::GroupByPrefix, Phase::GroupBySuffix, Phase::GroupByContents, ]) }; let thread_pool_sizes = config.thread_pool_sizes(); let devices = DiskDevices::new(&thread_pool_sizes); let transform = match config.transform() { None => None, Some(Ok(transform)) => Some(transform), Some(Err(e)) => return Err(Error::new(format!("Invalid transform: {e}"))), }; let base_dir = Path::from(current_dir().unwrap_or_default()); let group_filter = config.group_filter(); let path_selector = config .path_selector(&base_dir) .map_err(|e| format!("Invalid pattern: {e}"))?; let hasher = if config.cache { FileHasher::new_cached(config.hash_fn, transform, log)? } else { FileHasher::new(config.hash_fn, transform, log) }; Self::check_pool_config(thread_pool_sizes, &devices)?; Ok(GroupCtx { config, log, phases, group_filter, devices, path_selector, hasher, }) } /// Checks if all thread pool names refer to existing pools or devices fn check_pool_config( thread_pool_sizes: HashMap<OsString, Parallelism>, devices: &DiskDevices, ) -> Result<(), Error> { let mut allowed_pool_names = DiskDevices::device_types(); allowed_pool_names.push("main"); allowed_pool_names.push("default"); for (name, _) in thread_pool_sizes.iter() { let name = name.to_string_lossy(); match name.strip_prefix("dev:") { Some(name) if devices.get_by_name(OsStr::new(name)).is_none() => { return Err(Error::new(format!("Unknown device: {name}"))); } None if !allowed_pool_names.contains(&name.as_ref()) => { return Err(Error::new(format!( "Unknown thread pool or device type: {name}" ))); } _ => {} } } Ok(()) } } /// A group of files that have something in common, e.g. same size or same hash #[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Clone)] pub struct FileGroup<F> { /// Length of each file pub file_len: FileLen, /// Hash of a part or the whole of the file pub file_hash: FileHash, /// Group of files with the same length and hash pub files: Vec<F>, } /// Controls the type of search by determining the number of replicas /// allowed in a group of identical files. #[derive(Debug)] pub enum Replication { /// Looks for under-replicated files with replication factor lower than the specified number. /// `Underreplicated(2)` means searching for unique files. /// `Underreplicated(3)` means searching for file groups containing fewer than 3 replicas. Underreplicated(usize), /// Looks for over-replicated files with replication factor higher than the specified number. 
/// `Overreplicated(1)` means searching for duplicates. Overreplicated(usize), } /// Controls filtering of file groups between the search stages, as well as determines which file /// groups are reported in the final report. /// /// For example, when searching for duplicates, groups containing only a single file can be safely /// discarded. /// /// This is to be configured from the command line parameters set by the user. #[derive(Debug)] pub struct FileGroupFilter { /// The allowed number of replicas in the group. pub replication: Replication, /// A list of path prefixes for grouping files into isolated subgroups. /// Files inside a single subgroup are treated like a single replica. /// If empty - no additional grouping is performed. /// See [`GroupConfig::isolate`]. pub root_paths: Vec<Path>, /// If set to true, files with the same `FileId` are counted as one pub group_by_id: bool, } impl<F> FileGroup<F> { /// Returns the count of all files in the group pub fn file_count(&self) -> usize { self.files.len() } /// Returns the total size of all files in the group pub fn total_size(&self) -> FileLen { self.file_len * self.file_count() as u64 } /// Maps the list of files in the group. /// Preserves the group file len and hash. pub fn map<R>(self, f: impl Fn(F) -> R) -> FileGroup<R> { FileGroup { file_len: self.file_len, file_hash: self.file_hash, files: self.files.into_iter().map(f).collect(), } } /// Transforms files into different type, filtering out files that cannot be transformed pub fn filter_map<R>(self, f: impl Fn(F) -> Option<R>) -> FileGroup<R> { FileGroup { file_len: self.file_len, file_hash: self.file_hash, files: self.files.into_iter().filter_map(f).collect(), } } /// Tries to map each file by given fallible function. /// Does not stop processing on the first failure. /// If mapping any of the files fails, then returns a vector of errors. pub fn try_map_all<R: Debug, E: Debug>( self, f: impl Fn(F) -> Result<R, E>, ) -> Result<FileGroup<R>, Vec<E>> { let (ok, err): (Vec<_>, Vec<_>) = self.files.into_iter().map(f).partition(Result::is_ok); if err.is_empty() { Ok(FileGroup { file_len: self.file_len, file_hash: self.file_hash, files: ok.into_iter().map(Result::unwrap).collect(), }) } else { Err(err.into_iter().map(Result::unwrap_err).collect()) } } /// Flat maps the list of files in the group. /// Preserves the group file len and hash. pub fn flat_map<R, I>(self, f: impl Fn(F) -> I) -> FileGroup<R> where I: IntoIterator<Item = R>, { FileGroup { file_len: self.file_len, file_hash: self.file_hash, files: self.files.into_iter().flat_map(f).collect(), } } /// Splits the group into one or more groups based on the key function applied to each file. /// Files with the same key are placed in the same group. /// The key is computed only once per item. /// File len and file hash are preserved. pub fn partition_by_key<K: Eq + Hash>(self, key_fn: impl Fn(&F) -> K) -> Vec<FileGroup<F>> { let mut groups = HashMap::new(); for f in self.files { let key = key_fn(&f); groups.entry(key).or_insert_with(Vec::new).push(f); } groups .into_values() .map(|files| FileGroup { file_len: self.file_len, file_hash: self.file_hash.clone(), files, }) .collect() } } impl<F: AsRef<Path> + core::fmt::Debug> FileGroup<F> { #[cfg(test)] fn paths(&self) -> Vec<Path> { self.files.iter().map(|f| f.as_ref().clone()).collect_vec() } } impl<F: AsRef<FileId>> FileGroup<F> { /// Returns the number of files with distinct identifiers. /// Files must be sorted by id. 
pub fn unique_count(&self) -> usize { self.files .iter() .dedup_by(|f1, f2| FileId::of(f1) == FileId::of(f2)) .count() } /// Returns the total size of data in files with distinct identifiers. /// Files must be sorted by id. pub fn unique_size(&self) -> FileLen { self.file_len * self.unique_count() as u64 } /// Sorts the files in this group by their identifiers. pub fn sort_by_id(&mut self) { self.files.sort_by_key(|f| FileId::of(f)); } } impl<F: AsRef<Path> + AsRef<FileId>> FileGroup<F> { /// Returns true if the file group should be forwarded to the next grouping stage, /// because the number of duplicate files is higher than the maximum allowed number of replicas. /// /// This method returns always true if the user searches for underreplicated files /// (`filter.replication` is `Replication::Underreplicated`). This is because even if /// the number of replicas is currently higher than the maximum number of allowed replicas, /// the group can be split in later stages and the number of replicas in the group may drop. pub fn matches(&self, filter: &FileGroupFilter) -> bool { match filter.replication { Replication::Overreplicated(rf) => self.subgroup_count(filter) > rf, Replication::Underreplicated(_) => true, } } /// Returns true if the file group should be included in the final report. /// The number of replicas in the group must be appropriate for the condition /// specified in `filter.replication`. pub fn matches_strictly(&self, filter: &FileGroupFilter) -> bool { let count = self.subgroup_count(filter); match filter.replication { Replication::Overreplicated(rf) => count > rf, Replication::Underreplicated(rf) => count < rf, } } /// Returns the number of missing file replicas. /// /// This is the difference between the desired minimum number of replicas /// given by `filter.replication` and the number of files in the group. /// /// If the number of files is greater than the minimum number of replicas, or /// if `filter.replication` is set to `Replication::Overreplicated` 0 is returned. pub fn missing_count(&self, filter: &FileGroupFilter) -> usize { match filter.replication { Replication::Overreplicated(_) => 0, Replication::Underreplicated(rf) => rf.saturating_sub(self.subgroup_count(filter)), } } /// Returns the highest number of redundant files that could be removed from the group. /// /// If `filter.roots` are empty, the difference between the total number of files /// in the group and the desired maximum number of replicas controlled by `filter.replication` /// is returned. /// /// If `filter.roots` are not empty, then files in the group are split into subgroups first, /// where each subgroup shares one of the roots. If the number of subgroups `N` is larger /// than the allowed number of replicas r, the last N - r subgroups are considered /// redundant. The total number of files in redundant subgroups is returned. /// /// If the result would be negative in any of the above cases or if `filter.replication` /// is set to `Replication::Underreplicated`, 0 is returned. 
pub fn redundant_count(&self, filter: &FileGroupFilter) -> usize { match filter.replication { Replication::Underreplicated(_) => 0, Replication::Overreplicated(rf) => { let rf = max(rf, 1); if filter.root_paths.is_empty() { // fast-path, equivalent to the code in the else branch, but way faster self.file_count().saturating_sub(rf) } else { let sub_groups = FileSubGroup::group(&self.files, &filter.root_paths, filter.group_by_id); let sub_group_lengths = sub_groups .into_iter() .map(|sg| sg.files.len()) .collect_vec(); let cutoff_index = min(rf, sub_group_lengths.len()); sub_group_lengths[cutoff_index..].iter().sum() } } } } /// Returns either the number of files redundant or missing, depending on the type of search. pub fn reported_count(&self, filter: &FileGroupFilter) -> usize { match filter.replication { Replication::Overreplicated(_) => self.redundant_count(filter), Replication::Underreplicated(_) => self.missing_count(filter), } } /// The number of subgroups of paths with distinct root prefix. fn subgroup_count(&self, filter: &FileGroupFilter) -> usize { FileSubGroup::group(&self.files, &filter.root_paths, filter.group_by_id).len() } /// Sorts the files by their path names. /// If filter requires grouping by roots, then groups are kept together. pub fn sort_by_path(&mut self, root_paths: &[Path]) { self.files.sort_by(|f1, f2| { let p1: &Path = f1.as_ref(); let p2: &Path = f2.as_ref(); p1.cmp(p2) }); if !root_paths.is_empty() { self.files = FileSubGroup::group(self.files.drain(..), root_paths, true) .into_iter() .flat_map(|g| g.files) .collect() } } } impl<T> AsRef<FileGroup<T>> for FileGroup<T> { fn as_ref(&self) -> &FileGroup<T> { self } } /// A subgroup of identical files, typically smaller than a `FileGroup`. /// A subgroup is formed by files sharing the same path prefix, e.g. files on the same volume. /// In terms of file deduplication activities, a subgroup is an atomic entity - /// all files in a subgroup must be either dropped or kept. #[derive(Debug, Eq, PartialEq)] pub struct FileSubGroup<F> { pub files: Vec<F>, } impl<F> FileSubGroup<F> { pub fn empty() -> FileSubGroup<F> { FileSubGroup { files: vec![] } } pub fn single(f: F) -> FileSubGroup<F> { FileSubGroup { files: vec![f] } } pub fn push(&mut self, file: F) { self.files.push(file) } } impl<F: AsRef<Path> + AsRef<FileId>> FileSubGroup<F> { /// Splits a group of files into subgroups. /// /// Files that share the same prefix found in the roots array are placed in the same subgroup. /// The result vector is ordered primarily by the roots, and files having the same root have /// the same order as they came from the input iterator. Files with paths that don't start /// with any of the root prefixes are placed last in the result, in the same order as the input. /// /// If `group_by_id` is set, files with the same `FileId` are also grouped together. /// In this case, the order of groups follows the order of input files, i.e. the input vector /// is scanned and a new group is appended at the end each time a file with a /// distinct id appears in the input. /// /// If both `roots` is not empty and `group_by_id` is set, /// grouping by prefixes takes precedence over grouping by identifiers, /// so a file with the same id can be placed in two different prefix groups. 
/// pub fn group( files: impl IntoIterator<Item = F>, roots: &[Path], group_by_id: bool, ) -> Vec<FileSubGroup<F>> { let mut prefix_groups = Vec::from_iter(roots.iter().map(|_| FileSubGroup::empty())); let mut id_groups = IndexMap::new(); // important: keep order of insertion for f in files { let path: &Path = f.as_ref(); let id: FileId = *f.as_ref(); let root_idx = roots.iter().position(|r| r.is_prefix_of(path)); match root_idx { Some(idx) => prefix_groups[idx].files.push(f), None if group_by_id => id_groups.entry(id).or_insert(FileSubGroup::empty()).push(f), None => prefix_groups.push(FileSubGroup::single(f)), } } prefix_groups.extend(id_groups.into_values()); prefix_groups.retain(|sg| !sg.files.is_empty()); prefix_groups } } /// Helper struct to preserve the original file hash and keep it together with file information /// Sometimes the old hash must be taken into account, e.g. when combining the prefix hash with /// the suffix hash. struct HashedFileInfo { file_hash: FileHash, file_info: FileInfo, } /// Partitions files into separate vectors, where each vector holds files persisted /// on the same disk device. The vectors are returned in the same order as devices. fn partition_by_devices( files: Vec<FileGroup<FileInfo>>, devices: &DiskDevices, ) -> Vec<Vec<HashedFileInfo>> { let mut result: Vec<Vec<HashedFileInfo>> = Vec::with_capacity(devices.len()); for _ in 0..devices.len() { result.push(Vec::new()); } for g in files { for f in g.files { let device = &devices[f.get_device_index()]; result[device.index].push(HashedFileInfo { file_hash: g.file_hash.clone(), file_info: f, }); } } result } /// Iterates over grouped files, in parallel fn flat_iter(files: &[FileGroup<FileInfo>]) -> impl ParallelIterator<Item = &FileInfo> { files.par_iter().flat_map(|g| &g.files) } /// Groups files by length and hash computed by given `hash_fn`. /// Runs in parallel on dedicated thread pools. /// Files on different devices are hashed separately from each other. /// File hashes within a single device are computed in the order given by /// their `location` field to minimize seek latency. /// /// Caveats: the original grouping is lost. It is possible for two files that /// were in the different groups to end up in the same group if they have the same length /// and they hash to the same value. If you don't want this, you need to combine the old /// hash with the new hash in the provided `hash_fn`. fn rehash<'a, F1, F2, H>( groups: Vec<FileGroup<FileInfo>>, group_pre_filter: F1, group_post_filter: F2, devices: &DiskDevices, access_type: FileAccess, hash_fn: H, ) -> Vec<FileGroup<FileInfo>> where F1: Fn(&FileGroup<FileInfo>) -> bool, F2: Fn(&FileGroup<FileInfo>) -> bool, H: Fn((&mut FileInfo, FileHash)) -> Option<FileHash> + Sync + Send + 'a, { // Allow sharing the hash function between threads: type HashFn<'a> = dyn Fn((&mut FileInfo, FileHash)) -> Option<FileHash> + Sync + Send + 'a; let hash_fn: &HashFn<'a> = &hash_fn; let (tx, rx): (Sender<HashedFileInfo>, Receiver<HashedFileInfo>) = channel(); // There is no point in processing groups containing a single file. // Normally when searching for duplicates such groups are filtered out automatically after // each stage, however they are possible when searching for unique files. 
let (groups_to_fclones, groups_to_pass): (Vec<_>, Vec<_>) = groups.into_iter().partition(group_pre_filter); // This way we can split processing to separate thread-pools, one per device: let files = partition_by_devices(groups_to_fclones, devices); let mut hash_map = GroupMap::new(|f: HashedFileInfo| ((f.file_info.len, f.file_hash), f.file_info)); let hash_map_ref = &mut hash_map; // Scope needed so threads can access shared stuff like groups or shared functions. // The threads we launch are guaranteed to not live longer than this scope. thread::scope(move |s| { // Process all files in background for (mut files, device) in files.into_iter().zip(devices.iter()) { if files.is_empty() { continue; } let tx = tx.clone(); // Launch a separate thread for each device, so we can process // files on each device independently s.spawn(move |_| { // Sort files by their physical location, to reduce disk seek latency on HDD. // Additionally, files with the same id end up directly // next to each other so we can avoid rehashing the same files. files.par_sort_unstable_by_key(|f| f.file_info.location); // Some devices like HDDs may benefit from different amount of parallelism // depending on the access type. Therefore we chose a thread pool appropriate // for the access type let thread_pool = match access_type { FileAccess::Sequential => device.seq_thread_pool(), FileAccess::Random => device.rand_thread_pool(), }; let thread_count = thread_pool.current_num_threads() as isize; // Limit the number of tasks spawned at once into the thread-pool. // Each task creates a heap allocation and reserves memory in the queue. // It is more memory efficient to keep these tasks as long as possible // in our vector. Without this limit we observed over 20% more memory use // when processing 1M of files. let semaphore = Arc::new(Semaphore::new(8 * thread_count)); // Run hashing on the thread-pool dedicated to the device. // Group files by their identifiers so we hash only one file per unique id. for (_, fg) in &files.into_iter().group_by(|f| f.file_info.id) { let mut fg = fg.collect_vec(); let tx = tx.clone(); let guard = semaphore.clone().access_owned(); // Spawning a task into a thread-pool requires a static lifetime, // because generally the task could outlive caller's stack frame. // However, this is not the case for rehash function, because // we don't exit before all tasks are closed. // In the perfect world we should use scopes for that. Unfortunately // the current implementation of rayon scopes runs the scope body // on one of the thread-pool worker threads, so it is not possible // to safely block inside the scope, because that leads to deadlock // when the pool has only one thread. let hash_fn: &HashFn<'static> = unsafe { std::mem::transmute(hash_fn) }; thread_pool.spawn_fifo(move || { let _open_files_guard = RLIMIT_OPEN_FILES.clone().access_owned(); let old_hash = fg[0].file_hash.clone(); if let Some(hash) = hash_fn((&mut fg[0].file_info, old_hash)) { for mut f in fg { f.file_hash = hash.clone(); tx.send(f).unwrap(); } } // This forces moving the guard into this task and be released when // the task is done drop(guard); }); } }); } // Drop the original tx, so all tx are closed when the threads finish and // the next while loop will eventually exit drop(tx); // Collect the results from all threads and group them. 
// Note that this will happen as soon as data are available while let Ok(hashed_file) = rx.recv() { hash_map_ref.add(hashed_file); } }) .unwrap(); // Convert the hashmap into vector, leaving only large-enough groups: hash_map .into_iter() .map(|((len, hash), files)| FileGroup { file_len: len, file_hash: hash, files: files.to_vec(), }) .chain(groups_to_pass) .filter(group_post_filter) .collect() } /// Walks the directory tree and collects matching files in parallel into a vector fn scan_files(ctx: &GroupCtx<'_>) -> Vec<Vec<FileInfo>> { let file_collector = ThreadLocal::new(); let file_count = AtomicUsize::new(0); let spinner = ctx .log .progress_bar(&ctx.phases.format(Phase::Walk), ProgressBarLength::Unknown); let spinner_tick = &|_: &Path| { file_count.fetch_add(1, Ordering::Relaxed); spinner.inc(1); }; let config = &ctx.config; let min_size = config.min_size; let max_size = config.max_size.unwrap_or(FileLen::MAX); let mut walk = Walk::new(); walk.depth = config.depth.unwrap_or(usize::MAX); walk.hidden = config.hidden; walk.follow_links = config.follow_links; walk.report_links = config.symbolic_links; walk.no_ignore = config.no_ignore; walk.one_fs = config.one_fs; walk.path_selector = ctx.path_selector.clone(); walk.log = Some(ctx.log); walk.on_visit = spinner_tick; walk.run(ctx.config.input_paths(), |path| { file_info_or_log_err(path, &ctx.devices, ctx.log) .into_iter() .filter(|info| { let l = info.len; l >= min_size && l <= max_size }) .for_each(|info| { let vec = file_collector.get_or(|| RefCell::new(Vec::new())); vec.borrow_mut().push(info); }); }); ctx.log.info(format!( "Scanned {} file entries", file_count.load(Ordering::Relaxed) )); let files: Vec<_> = file_collector.into_iter().map(|r| r.into_inner()).collect(); let file_count: usize = files.iter().map(|v| v.len()).sum(); let total_size: u64 = files.iter().flat_map(|v| v.iter().map(|i| i.len.0)).sum(); ctx.log.info(format!( "Found {} ({}) files matching selection criteria", file_count, FileLen(total_size) )); files } /// Returns the sum of number of files in all groups fn file_count<'a, T: 'a>(groups: impl IntoIterator<Item = &'a FileGroup<T>>) -> usize { groups.into_iter().map(|g| g.file_count()).sum() } /// Returns the sum of sizes of files in all groups, including duplicates fn total_size<'a, T: 'a>(groups: impl IntoIterator<Item = &'a FileGroup<T>>) -> FileLen { groups.into_iter().map(|g| g.total_size()).sum() } /// Returns the sum of number of files in all groups fn unique_file_count<'a, T>(groups: impl IntoIterator<Item = &'a FileGroup<T>>) -> usize where T: AsRef<FileId> + 'a, { groups.into_iter().map(|g| g.unique_count()).sum() } /// Returns the sum of sizes of files in all groups, including duplicates fn unique_file_size<'a, T: 'a>(groups: impl IntoIterator<Item = &'a FileGroup<T>>) -> FileLen where T: AsRef<FileId> + 'a, { groups.into_iter().map(|g| g.unique_size()).sum() } /// Sorts each file group by file identifiers fn sort_files_by_id<'a, T: 'a>(groups: impl IntoIterator<Item = &'a mut FileGroup<T>>) where T: AsRef<FileId> + 'a, { for g in groups.into_iter() { g.sort_by_id() } } /// Returns an estimation of the number of files matching the search criteria fn stage_stats(groups: &[FileGroup<FileInfo>], filter: &FileGroupFilter) -> (usize, FileLen) { let mut total_count = 0; let mut total_size = FileLen(0); for g in groups { let count = g.reported_count(filter); let size = g.file_len * count as u64; total_count += count; total_size += size; } (total_count, total_size) } fn group_by_size(ctx: &GroupCtx<'_>, files: 
Vec<Vec<FileInfo>>) -> Vec<FileGroup<FileInfo>> { let file_count: usize = files.iter().map(|v| v.len()).sum(); let progress = ctx.log.progress_bar( &ctx.phases.format(Phase::GroupBySize), ProgressBarLength::Items(file_count as u64), ); let mut groups = GroupMap::new(|info: FileInfo| (info.len, info)); for files in files.into_iter() { for file in files.into_iter() { progress.inc(1); groups.add(file); } } let groups: Vec<_> = groups .into_iter() .map(|(l, files)| FileGroup { file_len: l, file_hash: FileHash::from(0), files: files.into_vec(), }) .filter(|g| g.matches(&ctx.group_filter)) .collect(); let stats = stage_stats(&groups, &ctx.group_filter); ctx.log.info(format!( "Found {} ({}) candidates after grouping by size", stats.0, stats.1 )); groups } /// Removes files with duplicate path names. fn deduplicate<F>(files: &mut Vec<FileInfo>, progress: F) where F: Fn(&Path) + Sync + Send, { let mut groups = GroupMap::new(|fi: FileInfo| (fi.location, fi)); for f in files.drain(..) { groups.add(f) } for (_, file_group) in groups.into_iter() { if file_group.len() == 1 { files.extend(file_group.into_iter().inspect(|p| progress(&p.path))); } else { files.extend( file_group .into_iter() .inspect(|p| progress(&p.path)) .unique_by(|p| p.path.hash128()), ) } } } fn remove_same_files( ctx: &GroupCtx<'_>, groups: Vec<FileGroup<FileInfo>>, ) -> Vec<FileGroup<FileInfo>> { let groups: Vec<_> = groups .into_par_iter() .update(|g| deduplicate(&mut g.files, |_| {})) .filter(|g| g.matches(&ctx.group_filter)) .collect(); let stats = stage_stats(&groups, &ctx.group_filter); ctx.log.info(format!( "Found {} ({}) candidates after grouping by paths", stats.0, stats.1, )); groups } #[cfg(target_os = "linux")] fn atomic_counter_vec(len: usize) -> Vec<std::sync::atomic::AtomicU32> { let mut v = Vec::with_capacity(len); for _ in 0..len { v.push(std::sync::atomic::AtomicU32::new(0)); } v } #[cfg(target_os = "linux")] fn update_file_locations(ctx: &GroupCtx<'_>, groups: &mut (impl FileCollection + ?Sized)) { let count = groups.count(); let progress = ctx .log .progress_bar("Fetching extents", ProgressBarLength::Items(count as u64)); let err_counters = atomic_counter_vec(ctx.devices.len()); groups.for_each_mut(|fi| { let device: &DiskDevice = &ctx.devices[fi.get_device_index()]; if device.disk_kind != sysinfo::DiskKind::SSD { if let Err(e) = fi.fetch_physical_location() { // Do not print a notice about slower access when fetching file extents has // failed because a file vanished -- now it will never be accessed anyhow. const ENOENT_NO_SUCH_FILE: i32 = 2; if e.raw_os_error() .map_or(true, |err| err != ENOENT_NO_SUCH_FILE) { handle_fetch_physical_location_err(ctx, &err_counters, fi, e) } } } progress.inc(1) }); } #[cfg(not(target_os = "linux"))] fn update_file_locations(_ctx: &GroupCtx<'_>, _groups: &mut (impl FileCollection + ?Sized)) {} /// Displays a warning message after fiemap ioctl fails and we don't know where the /// file data are located. /// The `err_counters` array is used to keep track of the number of errors recorded so far for /// given device - this array must contain the same number of entries as there are devices. /// If there are too many errors, subsequent warnings for the device are suppressed. 
#[cfg(target_os = "linux")] fn handle_fetch_physical_location_err( ctx: &GroupCtx<'_>, err_counters: &[std::sync::atomic::AtomicU32], file_info: &FileInfo, error: io::Error, ) { const MAX_ERR_COUNT_TO_LOG: u32 = 10; let device = &ctx.devices[file_info.get_device_index()]; let counter = &err_counters[device.index]; if crate::error::error_kind(&error) == io::ErrorKind::Unsupported { if counter.swap(MAX_ERR_COUNT_TO_LOG, Ordering::Release) < MAX_ERR_COUNT_TO_LOG { ctx.log.warn(format!( "File system {} on device {} doesn't support FIEMAP ioctl API. \ This is generally harmless, but random access performance might be decreased \ because fclones can't determine physical on-disk location of file data needed \ for reading files in the optimal order.", device.file_system, device.name.to_string_lossy() )); } } else if counter.load(Ordering::Acquire) < MAX_ERR_COUNT_TO_LOG { ctx.log.warn(format!( "Failed to fetch file extents mapping for file {}: {}. \ This is generally harmless, but it might decrease random access performance.", file_info.path.display(), error )); let err_count = counter.fetch_add(1, Ordering::AcqRel); if err_count == MAX_ERR_COUNT_TO_LOG { ctx.log.warn(format!( "Too many errors trying to fetch file extent mappings on device {}. \ Subsequent errors for this device will be ignored.", device.name.to_string_lossy() )) } } } /// Transforms files by piping them to an external program and groups them by their hashes fn group_transformed(ctx: &GroupCtx<'_>, files: Vec<FileInfo>) -> Vec<FileGroup<FileInfo>> { let mut files = files; files.par_sort_unstable_by_key(|f| FileId::of(f)); // need to sort so we know unique_file_count let groups = vec![FileGroup { file_len: FileLen(0), // doesn't matter, will be computed file_hash: FileHash::from(0), // doesn't matter, will be computed files, }]; let progress = ctx.log.progress_bar( &ctx.phases.format(Phase::TransformAndGroup), ProgressBarLength::Items(unique_file_count(&groups) as u64), ); let groups = rehash( groups, |_| true, |g| g.matches(&ctx.group_filter), &ctx.devices, FileAccess::Sequential, |(fi, _)| { let chunk = FileChunk::new(&fi.path, FilePos(0), fi.len); let result = ctx.hasher .hash_transformed_or_log_err(&chunk, |_| {}) .map(|(len, hash)| { fi.len = len; hash }); progress.inc(1); result }, ); let stats = stage_stats(&groups, &ctx.group_filter); ctx.log.info(format!( "Found {} ({}) {} files", stats.0, stats.1, ctx.config.search_type() )); groups } /// Returns the maximum value of the given property of the device, /// among the devices actually used to store any of the given files fn max_device_property<'a>( devices: &DiskDevices, files: impl ParallelIterator<Item = &'a FileInfo>, property_fn: impl Fn(&DiskDevice) -> FileLen + Sync, ) -> FileLen { files .into_par_iter() .map(|f| property_fn(&devices[f.get_device_index()])) .max() .unwrap_or_else(|| property_fn(devices.get_default())) } /// Returns the desired prefix length for a group of files. /// The return value depends on the capabilities of the devices the files are stored on. /// Higher values are desired if any of the files resides on an HDD. fn prefix_len<'a>( partitions: &DiskDevices, files: impl ParallelIterator<Item = &'a FileInfo>, ) -> FileLen { max_device_property(partitions, files, |dd| dd.max_prefix_len()) } /// Groups files by a hash of their first few thousand bytes. 
fn group_by_prefix( ctx: &GroupCtx<'_>, prefix_len: FileLen, groups: Vec<FileGroup<FileInfo>>, ) -> Vec<FileGroup<FileInfo>> { let mut groups = groups; sort_files_by_id(&mut groups); let pre_filter = |g: &FileGroup<FileInfo>| g.unique_count() > 1; let file_count = unique_file_count(groups.iter().filter(|g| pre_filter(g))); let progress = ctx.log.progress_bar( &ctx.phases.format(Phase::GroupByPrefix), ProgressBarLength::Items(file_count as u64), ); let groups = rehash( groups, pre_filter, |g| g.matches(&ctx.group_filter), &ctx.devices, FileAccess::Random, |(fi, _)| { progress.inc(1); let prefix_len = if fi.len <= prefix_len { prefix_len } else { ctx.devices[fi.get_device_index()].min_prefix_len() }; let chunk = FileChunk::new(&fi.path, FilePos(0), prefix_len); ctx.hasher.hash_file_or_log_err(&chunk, |_| {}) }, ); let stats = stage_stats(&groups, &ctx.group_filter); ctx.log.info(format!( "Found {} ({}) candidates after grouping by prefix", stats.0, stats.1 )); groups } /// Returns the desired suffix length for a group of files. /// The return value depends on the capabilities of the devices the files are stored on. /// Higher values are desired if any of the files resides on an HDD. fn suffix_len<'a>( partitions: &DiskDevices, files: impl ParallelIterator<Item = &'a FileInfo>, ) -> FileLen { max_device_property(partitions, files, |dd| dd.suffix_len()) } fn suffix_threshold<'a>( partitions: &DiskDevices, files: impl ParallelIterator<Item = &'a FileInfo>, ) -> FileLen { max_device_property(partitions, files, |dd| dd.suffix_threshold()) } fn group_by_suffix( ctx: &GroupCtx<'_>, groups: Vec<FileGroup<FileInfo>>, ) -> Vec<FileGroup<FileInfo>> { let mut groups = groups; sort_files_by_id(&mut groups); let suffix_len = ctx .config .max_suffix_size .unwrap_or_else(|| suffix_len(&ctx.devices, flat_iter(&groups))); let suffix_threshold = suffix_threshold(&ctx.devices, flat_iter(&groups)); let pre_filter = |g: &FileGroup<FileInfo>| g.file_len >= suffix_threshold && g.unique_count() > 1; let file_count = unique_file_count(groups.iter().filter(|g| pre_filter(g))); let progress = ctx.log.progress_bar( &ctx.phases.format(Phase::GroupBySuffix), ProgressBarLength::Items(file_count as u64), ); let groups = rehash( groups, pre_filter, |g| g.matches(&ctx.group_filter), &ctx.devices, FileAccess::Random, |(fi, old_hash)| { progress.inc(1); let chunk = FileChunk::new(&fi.path, fi.len.as_pos() - suffix_len, suffix_len); ctx.hasher .hash_file_or_log_err(&chunk, |_| {}) .map(|new_hash| old_hash ^ new_hash) }, ); let stats = stage_stats(&groups, &ctx.group_filter); ctx.log.info(format!( "Found {} ({}) candidates after grouping by suffix", stats.0, stats.1 )); groups } fn group_by_contents( ctx: &GroupCtx<'_>, min_file_len: FileLen, groups: Vec<FileGroup<FileInfo>>, ) -> Vec<FileGroup<FileInfo>> { let mut groups = groups; sort_files_by_id(&mut groups); let pre_filter = |g: &FileGroup<FileInfo>| g.unique_count() > 1 && g.file_len >= min_file_len; let bytes_to_scan = unique_file_size(groups.iter().filter(|g| pre_filter(g))); let progress = &ctx.log.progress_bar( &ctx.phases.format(Phase::GroupByContents), ProgressBarLength::Bytes(bytes_to_scan.0), ); let groups = rehash( groups, pre_filter, |g| g.matches_strictly(&ctx.group_filter), &ctx.devices, FileAccess::Sequential, |(fi, _)| { let chunk = FileChunk::new(&fi.path, FilePos(0), fi.len); ctx.hasher .hash_file_or_log_err(&chunk, |bytes_read| progress.inc(bytes_read as u64)) }, ); let stats = stage_stats(&groups, &ctx.group_filter); ctx.log.info(format!( "Found {} ({}) {} 
files", stats.0, stats.1, ctx.config.search_type() )); groups } /// Groups identical files together by 128-bit hash of their contents. /// Depending on filtering settings, can find unique, duplicate, over- or under-replicated files. /// /// # Input /// The input set of files or paths to scan should be given in the `config.paths` property. /// When `config.recursive` is set to true, the search descends into /// subdirectories recursively (default is false). /// /// # Output /// Returns a vector of groups of absolute paths. /// Each group of files has a common hash and length. /// Groups are sorted descending by file size. /// /// # Errors /// An error is returned immediately if the configuration is invalid. /// I/O errors during processing are logged as warnings and unreadable files are skipped. /// If panics happen they are likely a result of a bug and should be reported. /// /// # Performance characteristics /// The worst-case running time to is roughly proportional to the time required to /// open and read all files. Depending on the number of matching files and parameters of the /// query, that time can be lower because some files can be skipped from some stages of processing. /// The expected memory utilisation is roughly proportional the number of files and /// their path lengths. /// /// # Threading /// This function blocks caller's thread until all files are processed. /// To speed up processing, it spawns multiple threads internally. /// Some processing is performed on the default Rayon thread pool, therefore this function /// must not be called on Rayon thread pool to avoid a deadlock. /// The parallelism level is automatically set based on the type of storage and can be overridden /// in the configuration. /// /// # Algorithm /// Files are grouped in multiple stages and filtered after each stage. /// Files that turn out to be unique at some point are skipped from further stages. /// Stages are ordered by increasing I/O cost. On rotational drives, /// an attempt is made to sort files by physical data location before each grouping stage /// to reduce disk seek times. /// /// 1. Create a list of files to process by walking directory tree if recursive mode selected. /// 2. Get length and identifier of each file. /// 3. Group files by length. /// 4. In each group, remove duplicate files with the same identifier. /// 5. Group files by hash of the prefix. /// 6. Group files by hash of the suffix. /// 7. Group files by hash of their full contents. 
/// /// # Example /// ``` /// use fclones::log::StdLog; /// use fclones::config::GroupConfig; /// use fclones::Path; /// use fclones::{group_files, write_report}; /// /// let log = StdLog::new(); /// let mut config = GroupConfig::default(); /// config.paths = vec![Path::from("/path/to/a/dir")]; /// /// let groups = group_files(&config, &log).unwrap(); /// println!("Found {} groups: ", groups.len()); /// /// // print standard fclones report to stdout: /// write_report(&config, &log, &groups).unwrap(); /// ``` pub fn group_files(config: &GroupConfig, log: &dyn Log) -> Result<Vec<FileGroup<FileInfo>>, Error> { let spinner = log.progress_bar("Initializing", ProgressBarLength::Unknown); let ctx = GroupCtx::new(config, log)?; drop(spinner); let matching_files = scan_files(&ctx); let mut groups = match &ctx.hasher.transform { Some(_transform) => { let mut files = matching_files.into_iter().flatten().collect_vec(); deduplicate(&mut files, |_| {}); update_file_locations(&ctx, &mut files); group_transformed(&ctx, files) } _ => { let size_groups = group_by_size(&ctx, matching_files); let mut size_groups_pruned = remove_same_files(&ctx, size_groups); update_file_locations(&ctx, &mut size_groups_pruned); let prefix_len = ctx .config .max_prefix_size .unwrap_or_else(|| prefix_len(&ctx.devices, flat_iter(&size_groups_pruned))); let prefix_groups = group_by_prefix(&ctx, prefix_len, size_groups_pruned); let suffix_groups = group_by_suffix(&ctx, prefix_groups); if !ctx.config.skip_content_hash { group_by_contents(&ctx, prefix_len, suffix_groups) } else { suffix_groups } } }; groups.par_sort_by_key(|g| Reverse((g.file_len, g.file_hash.u128_prefix()))); groups .par_iter_mut() .for_each(|g| g.sort_by_path(&ctx.group_filter.root_paths)); Ok(groups) } /// Writes the list of groups to a file or the standard output. /// /// # Parameters /// - `config.output`: a path to the output file, `None` for standard output /// - `config.format`: selects the format of the output, see [`config::OutputFormat`] /// - `log`: used for drawing a progress bar to standard error /// - `groups`: list of groups of files to print, e.g. obtained from [`group_files`] /// /// # Errors /// Returns [`io::Error`] on I/O write error or if the output file cannot be created. 
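// Editor's note: an illustrative sketch, not part of the upstream sources.
// `write_report` below aggregates the per-group `redundant_count` and
// `missing_count` statistics defined earlier in this file; this test shows how
// those counts behave for a simple three-file group with no isolated roots.
// The module, helper and test names are made up.
#[cfg(test)]
mod reported_count_example {
    use super::{FileGroup, FileGroupFilter, Replication};
    use crate::file::{FileHash, FileId, FileInfo, FileLen, InodeId};
    use crate::path::Path;

    fn file(inode: InodeId, path: &str) -> FileInfo {
        FileInfo {
            id: FileId { device: 1, inode },
            len: FileLen(100),
            location: 0,
            path: Path::from(path),
        }
    }

    fn filter(replication: Replication) -> FileGroupFilter {
        FileGroupFilter {
            replication,
            root_paths: vec![],
            group_by_id: false,
        }
    }

    #[test]
    fn redundant_and_missing_counts() {
        let group = FileGroup {
            file_len: FileLen(100),
            file_hash: FileHash::from(0),
            files: vec![file(1, "a"), file(2, "b"), file(3, "c")],
        };
        // Duplicate search keeping at most 1 replica: 2 of the 3 files are redundant.
        assert_eq!(
            group.redundant_count(&filter(Replication::Overreplicated(1))),
            2
        );
        // Under-replication search wanting at least 5 replicas: 2 replicas are missing.
        assert_eq!(
            group.missing_count(&filter(Replication::Underreplicated(5))),
            2
        );
    }
}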
pub fn write_report( config: &GroupConfig, log: &dyn Log, groups: &[FileGroup<FileInfo>], ) -> io::Result<()> { let now = Local::now(); let total_count = file_count(groups.iter()); let total_size = total_size(groups.iter()); let (redundant_count, redundant_size) = groups.iter().fold((0, FileLen(0)), |res, g| { let count = g.redundant_count(&config.group_filter()); (res.0 + count, res.1 + g.file_len * count as u64) }); let (missing_count, missing_size) = groups.iter().fold((0, FileLen(0)), |res, g| { let count = g.missing_count(&config.group_filter()); (res.0 + count, res.1 + g.file_len * count as u64) }); let header = ReportHeader { timestamp: DateTime::from_naive_utc_and_offset(now.naive_utc(), *now.offset()), version: env!("CARGO_PKG_VERSION").to_owned(), command: args_os().map(Arg::from).collect(), base_dir: config.base_dir.clone(), stats: Some(FileStats { group_count: groups.len(), total_file_count: total_count, total_file_size: total_size, redundant_file_count: redundant_count, redundant_file_size: redundant_size, missing_file_count: missing_count, missing_file_size: missing_size, }), }; match &config.output { Some(path) => { let progress = log.progress_bar( "Writing report", ProgressBarLength::Items(groups.len() as u64), ); let iter = groups.iter().inspect(|_g| progress.inc(1)); let file = BufWriter::new(File::create(path)?); let mut reporter = ReportWriter::new(file, false); reporter.write(config.format, &header, iter) } None => { let term = Term::stdout(); let color = term.is_term(); let mut reporter = ReportWriter::new(BufWriter::new(term), color); reporter.write(config.format, &header, groups.iter()) } } } #[cfg(test)] mod test { use std::fs::{create_dir, hard_link, File, OpenOptions}; use std::io::{Read, Write}; use std::path::PathBuf; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use std::sync::Mutex; use crate::log::StdLog; use rand::seq::SliceRandom; use sysinfo::DiskKind; use crate::path::Path; use crate::util::test::*; use super::*; const MAX_PREFIX_LEN: usize = 256 * 1024; const MAX_SUFFIX_LEN: usize = 256 * 1024; #[test] fn items_should_be_split_into_groups() { use super::GroupMap; use smallvec::SmallVec; let mut map = GroupMap::new(|item: (u32, u32)| (item.0, item.1)); map.add((1, 10)); map.add((2, 20)); map.add((1, 11)); map.add((2, 21)); let mut groups: Vec<_> = map.into_iter().collect(); groups.sort_by_key(|item| item.0); assert_eq!(groups[0], (1, SmallVec::from_vec(vec![10, 11]))); assert_eq!(groups[1], (2, SmallVec::from_vec(vec![20, 21]))); } /// Files hashing to different values should be placed into different groups #[test] fn test_rehash_puts_files_with_different_hashes_to_different_groups() { let devices = DiskDevices::default(); let input = vec![FileGroup { file_len: FileLen(200), file_hash: FileHash::from(0), files: vec![ FileInfo { id: FileId { device: 1, inode: 1, }, len: FileLen(200), location: 0, path: Path::from("file1"), }, FileInfo { id: FileId { device: 1, inode: 2, }, len: FileLen(200), location: 35847587, path: Path::from("file2"), }, ], }]; let result = rehash( input, |_| true, |_| true, &devices, FileAccess::Random, |(fi, _)| Some(FileHash::from(fi.location as u128)), ); assert_eq!(result.len(), 2); assert_eq!(result[0].files.len(), 1); assert_eq!(result[1].files.len(), 1); assert_ne!(result[0].files[0].path, result[1].files[0].path); } #[test] fn test_rehash_doesnt_hash_files_with_same_id_more_than_once() { let devices = DiskDevices::default(); let input = vec![FileGroup { file_len: FileLen(200), file_hash: FileHash::from(0), files: 
vec![ FileInfo { id: FileId { device: 1, inode: 1, }, len: FileLen(200), location: 0, path: Path::from("file1"), }, FileInfo { id: FileId { device: 1, inode: 1, }, len: FileLen(200), location: 0, path: Path::from("file2"), }, ], }]; let hash_call_count = AtomicUsize::new(0); let result = rehash( input, |_| true, |_| true, &devices, FileAccess::Random, |(fi, _)| { hash_call_count.fetch_add(1, Ordering::Relaxed); Some(FileHash::from(fi.location as u128)) }, ); assert_eq!(result.len(), 1); assert_eq!(result[0].files.len(), 2); assert_ne!(result[0].files[0].path, result[0].files[1].path); assert_eq!(hash_call_count.load(Ordering::Relaxed), 1); } /// Files hashing to same values should be placed into the same groups #[test] fn test_rehash_puts_files_with_same_hashes_to_same_groups() { let devices = DiskDevices::default(); let input = vec![ FileGroup { file_len: FileLen(200), file_hash: FileHash::from(0), files: vec![FileInfo { id: FileId { device: 1, inode: 1, }, len: FileLen(200), location: 0, path: Path::from("file1"), }], }, FileGroup { file_len: FileLen(500), file_hash: FileHash::from(0), files: vec![FileInfo { id: FileId { device: 1, inode: 2, }, len: FileLen(200), location: 35847587, path: Path::from("file2"), }], }, ]; let result = rehash( input, |_| true, |_| true, &devices, FileAccess::Random, |(_, _)| Some(FileHash::from(123456)), ); assert_eq!(result.len(), 1); assert_eq!(result[0].files.len(), 2); } #[test] fn test_rehash_can_skip_processing_files() { let devices = DiskDevices::default(); let input = vec![FileGroup { file_len: FileLen(200), file_hash: FileHash::from(0), files: vec![FileInfo { id: FileId { device: 1, inode: 1, }, len: FileLen(200), location: 0, path: Path::from("file1"), }], }]; let called = AtomicBool::new(false); let result = rehash( input, |_| false, |_| true, &devices, FileAccess::Random, |(fi, _)| { called.store(true, Ordering::Release); Some(FileHash::from(fi.location as u128)) }, ); assert_eq!(result.len(), 1); assert!(!called.load(Ordering::Acquire)); } #[test] fn test_rehash_post_filter_removes_groups() { let devices = DiskDevices::default(); let input = vec![FileGroup { file_len: FileLen(200), file_hash: FileHash::from(0), files: vec![ FileInfo { id: FileId { device: 1, inode: 1, }, len: FileLen(200), location: 0, path: Path::from("file1"), }, FileInfo { id: FileId { device: 1, inode: 2, }, len: FileLen(200), location: 35847587, path: Path::from("file2"), }, ], }]; let result = rehash( input, |_| true, |g| g.files.len() >= 2, &devices, FileAccess::Random, |(fi, _)| Some(FileHash::from(fi.location as u128)), ); assert!(result.is_empty()) } #[test] fn test_rehash_processes_files_in_location_order_on_hdd() { let thread_count = 2; let devices = DiskDevices::single(DiskKind::HDD, thread_count); let count = 1000; let mut input = Vec::with_capacity(count); for i in 0..count { input.push(FileGroup { file_len: FileLen(0), file_hash: FileHash::from(0), files: vec![FileInfo { id: FileId { device: 1, inode: i as InodeId, }, len: FileLen(0), location: i as u64, path: Path::from(format!("file{i}")), }], }) } input.shuffle(&mut rand::thread_rng()); let processing_order = Mutex::new(Vec::new()); rehash( input, |_| true, |_| true, &devices, FileAccess::Random, |(fi, _)| { processing_order.lock().unwrap().push(fi.location as i32); Some(FileHash::from(fi.location as u128)) }, ); let processing_order = processing_order.into_inner().unwrap(); // Because we're processing files in parallel, we have no strict guarantee they // will be processed in the exact same order as in the 
input. // However, we expect some locality so the total distance between subsequent accesses // is low. let mut distance = 0; for i in 0..processing_order.len() - 1 { distance += i32::abs(processing_order[i] - processing_order[i + 1]) } assert!(distance < (thread_count * count) as i32) } #[test] fn identical_small_files() { with_dir("main/identical_small_files", |root| { let file1 = root.join("file1"); let file2 = root.join("file2"); write_test_file(&file1, b"aaa", b"", b""); write_test_file(&file2, b"aaa", b"", b""); let log = test_log(); let config = GroupConfig { paths: vec![file1.into(), file2.into()], ..GroupConfig::default() }; let results = group_files(&config, &log).unwrap(); assert_eq!(results.len(), 1); assert_eq!(results[0].file_len, FileLen(3)); assert_eq!(results[0].files.len(), 2); }); } #[test] fn identical_large_files() { with_dir("main/identical_large_files", |root| { let file1 = root.join("file1"); let file2 = root.join("file2"); write_test_file(&file1, &[0; MAX_PREFIX_LEN], &[1; 4096], &[2; 4096]); write_test_file(&file2, &[0; MAX_PREFIX_LEN], &[1; 4096], &[2; 4096]); let log = test_log(); let config = GroupConfig { paths: vec![file1.into(), file2.into()], ..GroupConfig::default() }; let results = group_files(&config, &log).unwrap(); assert_eq!(results.len(), 1); assert_eq!(results[0].files.len(), 2); }); } #[test] fn files_differing_by_size() { with_dir("main/files_differing_by_size", |root| { let file1 = root.join("file1"); let file2 = root.join("file2"); write_test_file(&file1, b"aaaa", b"", b""); write_test_file(&file2, b"aaa", b"", b""); let file1 = Path::from(file1); let file2 = Path::from(file2); let log = test_log(); let config = GroupConfig { paths: vec![file1.clone(), file2.clone()], rf_over: Some(0), ..GroupConfig::default() }; let results = group_files(&config, &log).unwrap(); assert_eq!(results.len(), 2); assert_eq!(results[0].paths(), vec![file1.canonicalize()]); assert_eq!(results[1].paths(), vec![file2.canonicalize()]); }); } #[test] fn files_differing_by_prefix() { with_dir("main/files_differing_by_prefix", |root| { let file1 = root.join("file1"); let file2 = root.join("file2"); write_test_file(&file1, b"aaa", b"", b""); write_test_file(&file2, b"bbb", b"", b""); let log = test_log(); let config = GroupConfig { paths: vec![file1.into(), file2.into()], unique: true, ..GroupConfig::default() }; let results = group_files(&config, &log).unwrap(); assert_eq!(results.len(), 2); assert_eq!(results[0].files.len(), 1); assert_eq!(results[1].files.len(), 1); }); } #[test] fn files_differing_by_suffix() { with_dir("main/files_differing_by_suffix", |root| { let file1 = root.join("file1"); let file2 = root.join("file2"); let prefix = [0; MAX_PREFIX_LEN]; let mid = [1; MAX_PREFIX_LEN + MAX_SUFFIX_LEN]; write_test_file(&file1, &prefix, &mid, b"suffix1"); write_test_file(&file2, &prefix, &mid, b"suffix2"); let log = test_log(); let config = GroupConfig { paths: vec![file1.into(), file2.into()], unique: true, ..GroupConfig::default() }; let results = group_files(&config, &log).unwrap(); assert_eq!(results.len(), 2); assert_eq!(results[0].files.len(), 1); assert_eq!(results[1].files.len(), 1); }); } #[test] fn files_differing_by_middle() { with_dir("main/files_differing_by_middle", |root| { let file1 = root.join("file1"); let file2 = root.join("file2"); let prefix = [0; MAX_PREFIX_LEN]; let suffix = [1; MAX_SUFFIX_LEN]; write_test_file(&file1, &prefix, b"middle1", &suffix); write_test_file(&file2, &prefix, b"middle2", &suffix); let log = test_log(); let config = 
GroupConfig { paths: vec![file1.into(), file2.into()], unique: true, ..GroupConfig::default() }; let results = group_files(&config, &log).unwrap(); assert_eq!(results.len(), 2); assert_eq!(results[0].files.len(), 1); assert_eq!(results[1].files.len(), 1); }); } #[test] fn hard_links() { with_dir("main/hard_links", |root| { let file1 = root.join("file1"); let file2 = root.join("file2"); write_test_file(&file1, b"aaa", b"", b""); hard_link(&file1, &file2).unwrap(); let log = test_log(); let config = GroupConfig { paths: vec![file1.into(), file2.into()], unique: true, // hardlinks to a common file should be treated as one file ..GroupConfig::default() }; let results = group_files(&config, &log).unwrap(); assert_eq!(results.len(), 1); assert_eq!(results[0].files.len(), 2); }); } #[test] #[cfg(unix)] fn report_symbolic_links_to_files() { with_dir("main/soft_links", |root| { let file1 = root.join("file1"); let file2 = root.join("file2"); write_test_file(&file1, b"aaa", b"", b""); std::os::unix::fs::symlink(&file1, &file2).unwrap(); let log = test_log(); let mut config = GroupConfig { paths: vec![file1.into(), file2.into()], // If both hard_links and symbolic_links is set to true, symbolic links should // be treated as duplicates. match_links: true, symbolic_links: true, ..GroupConfig::default() }; let results = group_files(&config, &log).unwrap(); assert_eq!(results.len(), 1); assert_eq!(results[0].files.len(), 2); // Symbolic links should be totally ignored: config.symbolic_links = false; let results = group_files(&config, &log).unwrap(); assert_eq!(results.len(), 0); // If hard_links is set to false and symbolic_links to true, // a symlink to a file should be reported, but not treated as a duplicate: config.unique = true; config.symbolic_links = true; config.match_links = false; let results = group_files(&config, &log).unwrap(); assert_eq!(results.len(), 1); assert_eq!(results[0].files.len(), 2); }); } #[test] fn duplicate_input_files() { with_dir("main/duplicate_input_files", |root| { let file1 = root.join("file1"); write_test_file(&file1, b"foo", b"", b""); let log = test_log(); let file1 = Path::from(file1); let config = GroupConfig { paths: vec![file1.clone(), file1.clone(), file1], match_links: true, unique: true, ..GroupConfig::default() }; let results = group_files(&config, &log).unwrap(); assert_eq!(results.len(), 1); assert_eq!(results[0].files.len(), 1); }); } #[test] #[cfg(unix)] fn duplicate_input_files_non_canonical() { use std::os::unix::fs::symlink; with_dir("main/duplicate_input_files_non_canonical", |root| { let dir = root.join("dir"); symlink(root, dir).unwrap(); let file1 = root.join("file1"); let file2 = root.join("dir/file1"); write_test_file(&file1, b"foo", b"", b""); let log = test_log(); let config = GroupConfig { paths: vec![file1.into(), file2.into()], match_links: true, unique: true, ..GroupConfig::default() }; let results = group_files(&config, &log).unwrap(); assert_eq!(results.len(), 1); assert_eq!(results[0].files.len(), 1); }); } #[test] fn duplicate_files_different_roots() { with_dir("main/duplicate_files_different_roots", |root| { let root1 = root.join("root1"); let root2 = root.join("root2"); create_dir(&root1).unwrap(); create_dir(&root2).unwrap(); let file1 = root1.join("file1"); let file2 = root1.join("file2"); write_test_file(&file1, b"foo", b"", b""); write_test_file(&file2, b"foo", b"", b""); let log = test_log(); let mut config = GroupConfig { paths: vec![root1.into(), root2.into()], isolate: true, ..GroupConfig::default() }; let results = 
group_files(&config, &log).unwrap(); assert_eq!(results.len(), 0); config.isolate = false; let results = group_files(&config, &log).unwrap(); assert_eq!(results.len(), 1); assert_eq!(results[0].files.len(), 2); }); } #[test] #[cfg(unix)] fn transformed_truncated() { with_dir("target/test/group/transform/truncate/", |root| { let input_path_1 = root.join("input1.txt"); let input_path_2 = root.join("input2.txt"); // the files are different and have different lengths, but their initial // 2 bytes are the same write_file(&input_path_1, "aa|1"); write_file(&input_path_2, "aa|23456"); let log = test_log(); let config = GroupConfig { paths: vec![input_path_1.into(), input_path_2.into()], // a transform that takes only the first two bytes of each file transform: Some("dd count=2 bs=1".to_string()), ..GroupConfig::default() }; let results = group_files(&config, &log).unwrap(); assert_eq!(results.len(), 1); assert_eq!(results[0].files.len(), 2); }) } #[test] fn unique_files() { with_dir("main/unique_files", |root| { let file1 = root.join("file1"); let file2 = root.join("file2"); let file3 = root.join("file3"); write_test_file(&file1, b"duplicate", b"", b""); write_test_file(&file2, b"duplicate", b"", b""); write_test_file(&file3, b"unique", b"", b""); let file3 = Path::from(file3); let log = test_log(); let config = GroupConfig { unique: true, paths: vec![file1.into(), file2.into(), file3.clone()], ..GroupConfig::default() }; let results = group_files(&config, &log).unwrap(); assert_eq!(results.len(), 1); assert_eq!(results[0].files[0].path, file3); }); } #[test] fn report() { with_dir("main/report", |root| { let file = root.join("file1"); write_test_file(&file, b"foo", b"", b""); let report_file = root.join("report.txt"); let log = test_log(); let config = GroupConfig { paths: vec![file.into()], unique: true, output: Some(report_file.clone()), ..GroupConfig::default() }; let results = group_files(&config, &log).unwrap(); write_report(&config, &log, &results).unwrap(); assert!(report_file.exists()); let mut report = String::new(); File::open(report_file) .unwrap() .read_to_string(&mut report) .unwrap(); assert!(report.contains("file1")) }); } #[test] fn split_to_subgroups() { fn file(path: &str, id: InodeId) -> FileInfo { FileInfo { path: Path::from(path), id: FileId { inode: id, device: 0, }, len: FileLen(1024), location: id * 1024, } } let roots = vec![Path::from("/r0"), Path::from("/r1"), Path::from("/r2")]; let files = vec![ file("/r1/f1a", 0), file("/r2/f2a", 1), file("/r2/f2b", 2), file("/r1/f1b", 3), file("/r1/f1c", 4), file("/r3/f3a", 5), file("/r2/f2c", 6), ]; let groups = FileSubGroup::group(files, &roots, true); assert_eq!( groups, vec![ FileSubGroup { files: vec![file("/r1/f1a", 0), file("/r1/f1b", 3), file("/r1/f1c", 4),] }, FileSubGroup { files: vec![file("/r2/f2a", 1), file("/r2/f2b", 2), file("/r2/f2c", 6)] }, FileSubGroup { files: vec![file("/r3/f3a", 5)] } ] ) } #[test] fn partition() { let fg = FileGroup { file_len: FileLen::from(1u64), file_hash: FileHash::from(1u128), files: vec!["a1", "b1", "a2", "b2", "b3"], }; let mut partitions = fg.partition_by_key(|f| f.chars().next().unwrap()); assert_eq!(partitions.len(), 2); partitions.sort_by_key(|p| p.files.len()); assert_eq!(partitions[0].files, vec!["a1", "a2"]); assert_eq!(partitions[1].files, vec!["b1", "b2", "b3"]); } #[test] fn map() { let fg = FileGroup { file_len: FileLen::from(1u64), file_hash: FileHash::from(1u128), files: vec!["a", "b"], }; let fg = fg.map(|f| format!("{f}.txt")); assert_eq!(fg.files, 
vec![String::from("a.txt"), String::from("b.txt")]); } #[test] fn try_map_all_happy_path() { let fg = FileGroup { file_len: FileLen::from(1u64), file_hash: FileHash::from(1u128), files: vec!["a", "b"], }; let fg = fg.try_map_all(|f| Result::<_, ()>::Ok(format!("{f}.txt"))); assert!(fg.is_ok()); assert_eq!( fg.unwrap().files, vec![String::from("a.txt"), String::from("b.txt")] ); } #[test] fn try_map_all_errors() { let fg = FileGroup { file_len: FileLen::from(1u64), file_hash: FileHash::from(1u128), files: vec!["a", "b"], }; let fg = fg.try_map_all(|f| Result::<(), _>::Err(format!("error {f}"))); assert!(fg.is_err()); assert_eq!( fg.unwrap_err(), vec![String::from("error a"), String::from("error b")] ); } #[test] fn flat_map() { let fg = FileGroup { file_len: FileLen::from(1u64), file_hash: FileHash::from(1u128), files: vec!["a1", "b1", "a2", "b2", "b3"], }; let fg = fg.flat_map(|f| if f.starts_with('a') { Some(f) } else { None }); assert_eq!(fg.files, vec!["a1", "a2"]); let fg = fg.flat_map(|f| vec![f, f]); assert_eq!(fg.files, vec!["a1", "a1", "a2", "a2"]); } fn write_test_file(path: &PathBuf, prefix: &[u8], mid: &[u8], suffix: &[u8]) { let mut file = OpenOptions::new() .write(true) .create(true) .open(path) .unwrap(); file.write_all(prefix).unwrap(); file.write_all(mid).unwrap(); file.write_all(suffix).unwrap(); } fn test_log() -> StdLog { let mut log = StdLog::new(); log.no_progress = true; log } } 07070100000013000081A4000000000000000000000001653E86C20000562F000000000000000000000000000000000000002500000000fclones-0.34.0/fclones/src/hasher.rsuse std::cell::RefCell; use std::cmp::{max, min}; use std::fs::{File, OpenOptions}; use std::hash::Hasher; use std::io; use std::io::{Read, Seek}; use std::str::FromStr; use metrohash::MetroHash128; use serde::{Deserialize, Serialize}; #[cfg(feature = "sha2")] use sha2::{Sha256, Sha512}; #[cfg(feature = "sha3")] use sha3::{Sha3_256, Sha3_512}; #[cfg(feature = "xxhash")] use xxhash_rust::xxh3::Xxh3; use crate::cache::{HashCache, Key}; use crate::file::{FileAccess, FileChunk, FileHash, FileLen, FileMetadata, FilePos}; use crate::log::{Log, LogExt}; use crate::path::Path; use crate::transform::Transform; use crate::Error; #[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Serialize, Deserialize, clap::ValueEnum)] pub enum HashFn { #[default] Metro, #[cfg(feature = "xxhash")] Xxhash, #[cfg(feature = "blake3")] Blake3, #[cfg(feature = "sha2")] Sha256, #[cfg(feature = "sha2")] Sha512, #[cfg(feature = "sha3")] Sha3_256, #[cfg(feature = "sha3")] Sha3_512, } impl HashFn { pub fn variants() -> Vec<&'static str> { vec![ "metro", #[cfg(feature = "xxhash")] "xxhash3", #[cfg(feature = "blake3")] "blake3", #[cfg(feature = "sha2")] "sha256", #[cfg(feature = "sha2")] "sha512", #[cfg(feature = "sha3")] "sha3-256", #[cfg(feature = "sha3")] "sha3-512", ] } } impl FromStr for HashFn { type Err = String; fn from_str(s: &str) -> Result<Self, Self::Err> { match s.to_lowercase().as_str() { "metro" => Ok(Self::Metro), #[cfg(feature = "xxhash")] "xxhash3" => Ok(Self::Xxhash), #[cfg(feature = "blake3")] "blake3" => Ok(Self::Blake3), #[cfg(feature = "sha2")] "sha256" => Ok(Self::Sha256), #[cfg(feature = "sha2")] "sha512" => Ok(Self::Sha512), #[cfg(feature = "sha3")] "sha3-256" => Ok(Self::Sha3_256), #[cfg(feature = "sha3")] "sha3-512" => Ok(Self::Sha3_512), _ => Err(format!("Unknown hash algorithm: {s}")), } } } /// Computes the hash of a data stream trait StreamHasher { fn new() -> Self; fn update(&mut self, bytes: &[u8]); fn finish(self) -> FileHash; } impl StreamHasher for 
MetroHash128 { fn new() -> Self { MetroHash128::new() } fn update(&mut self, bytes: &[u8]) { self.write(bytes) } fn finish(self) -> FileHash { let (a, b) = self.finish128(); FileHash::from(((a as u128) << 64) | b as u128) } } #[cfg(feature = "xxhash")] impl StreamHasher for Xxh3 { fn new() -> Self { Xxh3::new() } fn update(&mut self, bytes: &[u8]) { self.update(bytes) } fn finish(self) -> FileHash { FileHash::from(self.digest128()) } } #[cfg(feature = "blake3")] impl StreamHasher for blake3::Hasher { fn new() -> Self { blake3::Hasher::new() } fn update(&mut self, bytes: &[u8]) { self.update(bytes); } fn finish(self) -> FileHash { FileHash::from(self.finalize().as_bytes().as_slice()) } } #[cfg(feature = "sha2")] impl StreamHasher for Sha256 { fn new() -> Self { <Sha256 as sha2::Digest>::new() } fn update(&mut self, bytes: &[u8]) { sha2::Digest::update(self, bytes); } fn finish(self) -> FileHash { use sha2::Digest; let result = self.finalize(); FileHash::from(result.as_slice()) } } #[cfg(feature = "sha2")] impl StreamHasher for Sha512 { fn new() -> Self { <Sha512 as sha2::Digest>::new() } fn update(&mut self, bytes: &[u8]) { sha2::Digest::update(self, bytes); } fn finish(self) -> FileHash { use sha2::Digest; let result = self.finalize(); FileHash::from(result.as_slice()) } } #[cfg(feature = "sha3")] impl StreamHasher for Sha3_256 { fn new() -> Self { <Sha3_256 as sha3::Digest>::new() } fn update(&mut self, bytes: &[u8]) { sha3::Digest::update(self, bytes); } fn finish(self) -> FileHash { use sha3::Digest; let result = self.finalize(); FileHash::from(result.as_slice()) } } #[cfg(feature = "sha3")] impl StreamHasher for Sha3_512 { fn new() -> Self { <Sha3_512 as sha3::Digest>::new() } fn update(&mut self, bytes: &[u8]) { sha3::Digest::update(self, bytes); } fn finish(self) -> FileHash { use sha3::Digest; let result = self.finalize(); FileHash::from(result.as_slice()) } } /// Hashes file contents pub struct FileHasher<'a> { pub(crate) algorithm: HashFn, pub(crate) buf_len: usize, pub(crate) cache: Option<HashCache>, pub(crate) transform: Option<Transform>, pub(crate) log: &'a dyn Log, } impl FileHasher<'_> { /// Creates a hasher with no caching pub fn new(algorithm: HashFn, transform: Option<Transform>, log: &dyn Log) -> FileHasher<'_> { FileHasher { algorithm, buf_len: 65536, cache: None, transform, log, } } /// Creates a default hasher with caching enabled pub fn new_cached( algorithm: HashFn, transform: Option<Transform>, log: &dyn Log, ) -> Result<FileHasher<'_>, Error> { let transform_command_str = transform.as_ref().map(|t| t.command_str.as_str()); let cache = HashCache::open_default(transform_command_str, algorithm)?; Ok(FileHasher { algorithm, buf_len: 65536, cache: Some(cache), transform, log, }) } /// Computes the file hash or logs an error and returns none if failed. /// If file is not found, no error is logged and `None` is returned. 
pub fn hash_file( &self, chunk: &FileChunk<'_>, progress: impl Fn(usize), ) -> io::Result<FileHash> { let cache = self.cache.as_ref(); let metadata = cache.and_then(|_| FileMetadata::new(chunk.path).ok()); let metadata = metadata.as_ref(); let key = cache .zip(metadata.as_ref()) .and_then(|(c, m)| c.key(chunk, m).ok()); let key = key.as_ref(); let hash = self.load_hash(key, metadata); if let Some((_, hash)) = hash { progress(chunk.len.0 as usize); return Ok(hash); } let hash = match self.algorithm { HashFn::Metro => file_hash::<MetroHash128>(chunk, self.buf_len, progress), #[cfg(feature = "xxhash")] HashFn::Xxhash => file_hash::<Xxh3>(chunk, self.buf_len, progress), #[cfg(feature = "blake3")] HashFn::Blake3 => file_hash::<blake3::Hasher>(chunk, self.buf_len, progress), #[cfg(feature = "sha2")] HashFn::Sha256 => file_hash::<Sha256>(chunk, self.buf_len, progress), #[cfg(feature = "sha2")] HashFn::Sha512 => file_hash::<Sha512>(chunk, self.buf_len, progress), #[cfg(feature = "sha3")] HashFn::Sha3_256 => file_hash::<Sha3_256>(chunk, self.buf_len, progress), #[cfg(feature = "sha3")] HashFn::Sha3_512 => file_hash::<Sha3_512>(chunk, self.buf_len, progress), }?; self.store_hash(key, metadata, chunk.len, hash.clone()); Ok(hash) } pub fn hash_file_or_log_err( &self, chunk: &FileChunk<'_>, progress: impl Fn(usize), ) -> Option<FileHash> { match self.hash_file(chunk, progress) { Ok(hash) => Some(hash), Err(e) if e.kind() == io::ErrorKind::NotFound => None, Err(e) => { self.log.warn(format!( "Failed to compute hash of file {}: {}", chunk.path.to_escaped_string(), e )); None } } } /// Just like `hash_file`, but transforms the file before hashing. pub fn hash_transformed( &self, chunk: &FileChunk<'_>, progress: impl Fn(usize), ) -> io::Result<(FileLen, FileHash)> { assert_eq!(chunk.pos, FilePos::zero()); assert!(self.transform.is_some()); let transform = self.transform.as_ref().unwrap(); let cache = self.cache.as_ref(); let metadata = cache.and_then(|_| FileMetadata::new(chunk.path).ok()); let metadata = metadata.as_ref(); let key = cache .zip(metadata.as_ref()) .and_then(|(c, m)| c.key(chunk, m).ok()); let key = key.as_ref(); let hash = self.load_hash(key, metadata); if let Some(hash) = hash { progress(chunk.len.0 as usize); return Ok(hash); } let mut transform_output = transform.run(chunk.path)?; let stream = &mut transform_output.out_stream; let buf_len = self.buf_len; // Transformed file may have a different length, so we cannot use stream_hash progress // reporting, as it would report progress of the transformed stream. Instead we advance // progress after doing the full file. 
let hash = match self.algorithm { HashFn::Metro => stream_hash::<MetroHash128>(stream, chunk.len, buf_len, |_| {}), #[cfg(feature = "xxhash")] HashFn::Xxhash => stream_hash::<Xxh3>(stream, chunk.len, buf_len, |_| {}), #[cfg(feature = "blake3")] HashFn::Blake3 => stream_hash::<blake3::Hasher>(stream, chunk.len, buf_len, |_| {}), #[cfg(feature = "sha2")] HashFn::Sha256 => stream_hash::<Sha256>(stream, chunk.len, buf_len, |_| {}), #[cfg(feature = "sha2")] HashFn::Sha512 => stream_hash::<Sha512>(stream, chunk.len, buf_len, |_| {}), #[cfg(feature = "sha3")] HashFn::Sha3_256 => stream_hash::<Sha3_256>(stream, chunk.len, buf_len, |_| {}), #[cfg(feature = "sha3")] HashFn::Sha3_512 => stream_hash::<Sha3_512>(stream, chunk.len, buf_len, |_| {}), }; progress(chunk.len.0 as usize); let hash = hash?; let exit_status = transform_output.child.lock().unwrap().wait()?; if !exit_status.success() { let captured_err = transform_output .err_stream .take() .unwrap() .join() .unwrap_or_else(|_| "".to_owned()); let captured_err = format_output_stream(captured_err.as_str()); return match exit_status.code() { Some(exit_code) => Err(io::Error::new( io::ErrorKind::Other, format!( "{} failed with non-zero status code: {}{}", transform.program, exit_code, captured_err ), )), None => Err(io::Error::new( io::ErrorKind::Other, format!("{} failed{}", transform.program, captured_err), )), }; } self.store_hash(key, metadata, hash.0, hash.1.clone()); Ok(hash) } pub fn hash_transformed_or_log_err( &self, chunk: &FileChunk<'_>, progress: impl Fn(usize), ) -> Option<(FileLen, FileHash)> { match self.hash_transformed(chunk, progress) { Ok(hash) => Some(hash), Err(e) if e.kind() == io::ErrorKind::NotFound => None, Err(e) => { self.log.warn(format!( "Failed to compute hash of file {}: {}", chunk.path.to_escaped_string(), e )); None } } } /// Loads hash from the cache. /// If the hash is not present in the cache, returns `None`. /// If the operation fails (e.g. corrupted cache), logs a warning and returns `None`. fn load_hash( &self, key: Option<&Key>, metadata: Option<&FileMetadata>, ) -> Option<(FileLen, FileHash)> { self.cache .as_ref() .zip(key) .zip(metadata) .and_then(|((cache, key), metadata)| match cache.get(key, metadata) { Ok(len_and_hash) => len_and_hash, Err(e) => { self.log.warn(format!( "Failed to load hash of file id = {key} from the cache: {e}" )); None } }) } /// Stores the hash in the cache. /// If the operation fails (e.g. no space on drive), logs a warning. fn store_hash( &self, key: Option<&Key>, metadata: Option<&FileMetadata>, data_len: FileLen, hash: FileHash, ) { if let Some(((cache, key), metadata)) = self.cache.as_ref().zip(key.as_ref()).zip(metadata.as_ref()) { if let Err(e) = cache.put(key, metadata, data_len, hash) { self.log.warn(format!( "Failed to store hash of file {key} in the cache: {e}" )) } }; } } impl<'a> Drop for FileHasher<'a> { fn drop(&mut self) { if let Some(cache) = self.cache.take() { if let Err(e) = cache.close() { self.log.warn(e); } } } } fn format_output_stream(output: &str) -> String { let output = output.trim().to_string(); if output.is_empty() { output } else { format!("\n{output}\n") } } #[cfg(target_os = "linux")] fn to_off_t(offset: u64) -> libc::off_t { min(libc::off_t::MAX as u64, offset) as libc::off_t } /// Wrapper for `posix_fadvise`. Ignores errors. /// This method is used to advise the system, so its failure is not critical to the result of /// the program. At worst, failure could hurt performance. 
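// Note (illustrative): callers pass a whole-file range and an access hint here,
// e.g. `fadvise(&file, FilePos::zero(), len, PosixFadviseAdvice::POSIX_FADV_SEQUENTIAL)`,
// as `configure_readahead` and `evict_page_cache` below do.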
#[cfg(target_os = "linux")] fn fadvise(file: &File, offset: FilePos, len: FileLen, advice: nix::fcntl::PosixFadviseAdvice) { use std::os::unix::io::AsRawFd; let _ = nix::fcntl::posix_fadvise( file.as_raw_fd(), to_off_t(offset.into()), to_off_t(len.into()), advice, ); } /// Optimizes file read performance based on how many bytes we are planning to read. /// If we know we'll be reading just one buffer, non zero read-ahead would be a cache waste. /// On non-Unix systems, does nothing. /// Failures are not signalled to the caller, but a warning is printed to stderr. #[allow(unused)] fn configure_readahead(file: &File, offset: FilePos, len: FileLen, access: FileAccess) { #[cfg(target_os = "linux")] { use nix::fcntl::*; let advise = |advice: PosixFadviseAdvice| fadvise(file, offset, len, advice); match access { FileAccess::Random => advise(PosixFadviseAdvice::POSIX_FADV_RANDOM), FileAccess::Sequential => advise(PosixFadviseAdvice::POSIX_FADV_SEQUENTIAL), }; } } /// Tells the system to remove given file fragment from the page cache. /// On non-Unix systems, does nothing. #[allow(unused)] fn evict_page_cache(file: &File, offset: FilePos, len: FileLen) { #[cfg(target_os = "linux")] { use nix::fcntl::*; fadvise(file, offset, len, PosixFadviseAdvice::POSIX_FADV_DONTNEED); } } /// Evicts the middle of the file from cache if the system is low on free memory. /// The purpose of this method is to be nice to the data cached by other processes. /// This program is likely to be used only once, so there is little value in keeping its /// data cached for further use. #[allow(unused)] fn evict_page_cache_if_low_mem(file: &mut File, len: FileLen) { #[cfg(target_os = "linux")] { use sysinfo::{System, SystemExt}; let skipped_prefix_len = FileLen(256 * 1024); if len > skipped_prefix_len { let mut system = System::new(); system.refresh_memory(); let free_mem = system.free_memory(); let total_mem = system.total_memory(); let free_ratio = free_mem as f32 / total_mem as f32; if free_ratio < 0.05 { evict_page_cache( file, FilePos::zero() + skipped_prefix_len, len - skipped_prefix_len, ); } } } } /// Opens a file and positions it at the given offset. /// Additionally, sends the advice to the operating system about how many bytes will be read. fn open(path: &Path, offset: FilePos, len: FileLen, access_type: FileAccess) -> io::Result<File> { let mut file = open_noatime(path)?; configure_readahead(&file, offset, len, access_type); if offset > FilePos::zero() { file.seek(offset.into())?; } Ok(file) } /// Opens a file for read. On unix systems passes O_NOATIME flag to drastically improve /// performance of reading small files. fn open_noatime(path: &Path) -> io::Result<File> { let path = path.to_path_buf(); let mut options = OpenOptions::new(); options.read(true); #[cfg(target_os = "linux")] { use std::os::unix::fs::OpenOptionsExt; let mut noatime_opts = options.clone(); noatime_opts.custom_flags(libc::O_NOATIME); noatime_opts .open(&path) // opening with O_NOATIME may fail in some cases for security reasons .or_else(|_| options.open(&path)) } #[cfg(not(target_os = "linux"))] { options.open(path) } } thread_local! { static BUF: RefCell<Vec<u8>> = RefCell::new(Vec::new()); } /// Scans up to `len` bytes in a file and sends data to the given consumer. /// Returns the number of bytes successfully read. 
fn scan<F: FnMut(&[u8])>( stream: &mut impl Read, len: FileLen, buf_len: usize, mut consumer: F, ) -> io::Result<u64> { BUF.with(|buf| { let mut buf = buf.borrow_mut(); let new_len = max(buf.len(), buf_len); buf.resize(new_len, 0); let mut read: u64 = 0; let len = len.into(); while read < len { let remaining = len - read; let to_read = min(remaining, buf.len() as u64) as usize; let buf = &mut buf[..to_read]; match stream.read(buf) { Ok(0) => break, Ok(actual_read) => { read += actual_read as u64; (consumer)(&buf[..actual_read]); } Err(e) => { return Err(e); } } } Ok(read) }) } /// Computes the hash value over at most `len` bytes of the stream. /// Returns the number of the bytes read and a 128-bit hash value. fn stream_hash<H: StreamHasher>( stream: &mut impl Read, len: FileLen, buf_len: usize, progress: impl Fn(usize), ) -> io::Result<(FileLen, FileHash)> { let mut hasher = H::new(); let mut read_len: FileLen = FileLen(0); scan(stream, len, buf_len, |buf| { hasher.update(buf); read_len += FileLen(buf.len() as u64); (progress)(buf.len()); })?; Ok((read_len, hasher.finish())) } /// Computes hash of initial `len` bytes of a file. /// If the file does not exist or is not readable, print the error to stderr and return `None`. /// The returned hash is not cryptograhically secure. fn file_hash<H: StreamHasher>( chunk: &FileChunk<'_>, buf_len: usize, progress: impl Fn(usize), ) -> io::Result<FileHash> { let access = if chunk.len.0 < 64 * 1024 { FileAccess::Random } else { FileAccess::Sequential }; let mut file = open(chunk.path, chunk.pos, chunk.len, access)?; let hash = stream_hash::<H>(&mut file, chunk.len, buf_len, progress)?.1; evict_page_cache_if_low_mem(&mut file, chunk.len); Ok(hash) } #[cfg(test)] mod test { use metrohash::MetroHash128; use std::io::Write; use tempfile::NamedTempFile; use crate::file::{FileChunk, FileLen, FilePos}; use crate::hasher::{file_hash, StreamHasher}; use crate::path::Path; fn test_file_hash<H: StreamHasher>() { let mut file1 = NamedTempFile::new().unwrap(); file1.write_all(b"Test file 1").unwrap(); let mut file2 = NamedTempFile::new().unwrap(); file2.write_all(b"Test file 2").unwrap(); let file1 = Path::from(&file1); let file2 = Path::from(&file2); let chunk1 = FileChunk::new(&file1, FilePos(0), FileLen::MAX); let chunk2 = FileChunk::new(&file2, FilePos(0), FileLen::MAX); let chunk3 = FileChunk::new(&file2, FilePos(0), FileLen(8)); let hash1 = file_hash::<H>(&chunk1, 4096, |_| {}).unwrap(); let hash2 = file_hash::<H>(&chunk2, 4096, |_| {}).unwrap(); let hash3 = file_hash::<H>(&chunk3, 4096, |_| {}).unwrap(); assert_ne!(hash1, hash2); assert_ne!(hash2, hash3); } #[test] fn test_file_hash_metro_128() { test_file_hash::<MetroHash128>() } #[test] #[cfg(feature = "xxhash")] fn test_file_hash_xxh3() { test_file_hash::<xxhash_rust::xxh3::Xxh3>() } #[test] #[cfg(feature = "blake3")] fn test_file_hash_blake3() { test_file_hash::<blake3::Hasher>() } #[test] #[cfg(feature = "sha2")] fn test_file_hash_sha256() { test_file_hash::<sha2::Sha256>() } #[test] #[cfg(feature = "sha2")] fn test_file_hash_sha512() { test_file_hash::<sha2::Sha512>() } #[test] #[cfg(feature = "sha3")] fn test_file_hash_sha3_256() { test_file_hash::<sha3::Sha3_256>() } #[test] #[cfg(feature = "sha3")] fn test_file_hash_sha3_512() { test_file_hash::<sha3::Sha3_512>() } } 07070100000014000081A4000000000000000000000001653E86C2000002E8000000000000000000000000000000000000002200000000fclones-0.34.0/fclones/src/lib.rspub mod config; pub mod log; pub mod progress; pub mod report; mod arg; mod cache; mod 
dedupe; mod device; mod error; mod file; mod group; mod hasher; mod lock; mod path; mod pattern; mod phase; mod reflink; mod regex; mod rlimit; mod selector; mod semaphore; mod transform; mod util; mod walk; pub use config::{DedupeConfig, GroupConfig, Priority}; pub use dedupe::{ dedupe, log_script, run_script, sort_by_priority, DedupeOp, DedupeResult, PartitionedFileGroup, PathAndMetadata, }; pub use device::DiskDevices; pub use error::Error; pub use file::{FileHash, FileId, FileInfo, FileLen}; pub use group::{group_files, write_report, FileGroup, FileSubGroup}; pub use path::Path; const TIMESTAMP_FMT: &str = "%Y-%m-%d %H:%M:%S.%3f %z"; 07070100000015000081A4000000000000000000000001653E86C200000A2F000000000000000000000000000000000000002300000000fclones-0.34.0/fclones/src/lock.rsuse crate::error::error_kind; use std::fs::File; use std::{fs, io}; use crate::path::Path; /// Portable file locking. /// /// On Unix, advisory lock through fnctl is used. /// On Windows, file is open in read-write mode. /// /// The file must exist before locking. pub struct FileLock { pub file: File, } impl FileLock { #[cfg(unix)] fn nix_as_io_error<T>(result: nix::Result<T>) -> io::Result<T> { match result { Ok(x) => Ok(x), Err(e) => Err(e.into()), } } /// Creates a libc::flock initialized to zeros. /// Should be safe, because flock contains primitive fields only, no references. #[cfg(unix)] fn new_flock() -> libc::flock { unsafe { std::mem::zeroed() } } #[cfg(unix)] #[allow(clippy::unnecessary_cast)] fn fcntl_lock(file: &File) -> io::Result<()> { use nix::fcntl::*; use std::os::unix::io::AsRawFd; let mut f = Self::new_flock(); f.l_type = libc::F_WRLCK as i16; f.l_whence = libc::SEEK_SET as i16; let result = nix::fcntl::fcntl(file.as_raw_fd(), FcntlArg::F_SETLK(&f)); Self::nix_as_io_error(result).map(|_| {}) } #[cfg(unix)] #[allow(clippy::unnecessary_cast)] fn fcntl_unlock(file: &File) -> io::Result<()> { use nix::fcntl::*; use std::os::unix::io::AsRawFd; let mut f = Self::new_flock(); f.l_type = libc::F_UNLCK as i16; f.l_whence = libc::SEEK_SET as i16; let result = nix::fcntl::fcntl(file.as_raw_fd(), FcntlArg::F_SETLK(&f)); Self::nix_as_io_error(result).map(|_| {}) } /// Locks a file and obtains its metadata. /// On error, the error message will contain the path. pub fn new(path: &Path) -> io::Result<FileLock> { let path_buf = path.to_path_buf(); let file = fs::OpenOptions::new() .read(false) .write(true) .create(false) .open(path_buf) .map_err(|e| { io::Error::new( error_kind(&e), format!("Failed to open file {} for write: {}", path.display(), e), ) })?; #[cfg(unix)] if let Err(e) = Self::fcntl_lock(&file) { return Err(io::Error::new( error_kind(&e), format!("Failed to lock file {}: {}", path.display(), e), )); }; Ok(FileLock { file }) } } impl Drop for FileLock { fn drop(&mut self) { #[cfg(unix)] let _ = Self::fcntl_unlock(&self.file); } } 07070100000016000081A4000000000000000000000001653E86C20000154C000000000000000000000000000000000000002200000000fclones-0.34.0/fclones/src/log.rs//! Logging and progress reporting. use std::sync::{Arc, Mutex, Weak}; use console::style; use nom::lib::std::fmt::Display; use crate::progress::{ProgressBar, ProgressTracker}; use chrono::Local; /// Determines the size of the task tracked by ProgressTracker. #[derive(Debug, Clone, Copy)] pub enum ProgressBarLength { Items(u64), Bytes(u64), Unknown, } #[derive(Debug, Clone, Copy)] pub enum LogLevel { Info, Warn, Error, } /// Common interface for logging diagnostics and progress. 
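///
/// # Example
/// A minimal sketch of a custom implementation that silences progress reporting.
/// The `QuietLog` name is illustrative and not part of the crate:
/// ```
/// use std::sync::Arc;
/// use fclones::log::{Log, LogLevel, ProgressBarLength};
/// use fclones::progress::{NoProgressBar, ProgressTracker};
///
/// struct QuietLog;
///
/// impl Log for QuietLog {
///     // Return a tracker that displays nothing.
///     fn progress_bar(&self, _msg: &str, _len: ProgressBarLength) -> Arc<dyn ProgressTracker> {
///         Arc::new(NoProgressBar)
///     }
///     // Forward diagnostics to standard error.
///     fn log(&self, level: LogLevel, msg: String) {
///         eprintln!("[{:?}] {}", level, msg);
///     }
/// }
/// ```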
pub trait Log: Sync + Send { /// Clears any previous progress bar or spinner and installs a new progress bar. fn progress_bar(&self, msg: &str, len: ProgressBarLength) -> Arc<dyn ProgressTracker>; /// Logs a message. fn log(&self, level: LogLevel, msg: String); } /// Additional convenience methods for logging. pub trait LogExt { /// Logs an info message. fn info(&self, msg: impl Display); /// Logs an warning. fn warn(&self, msg: impl Display); /// Logs an error. fn err(&self, msg: impl Display); } /// Additional convenience methods for logging. impl<L: Log + ?Sized> LogExt for L { /// Logs an info message. fn info(&self, msg: impl Display) { self.log(LogLevel::Info, msg.to_string()) } /// Logs an warning. fn warn(&self, msg: impl Display) { self.log(LogLevel::Warn, msg.to_string()) } /// Logs an error. fn err(&self, msg: impl Display) { self.log(LogLevel::Error, msg.to_string()) } } /// A logger that uses standard error stream to communicate with the user. pub struct StdLog { program_name: String, progress_bar: Mutex<Weak<ProgressBar>>, pub log_stderr_to_stdout: bool, pub no_progress: bool, } impl StdLog { pub fn new() -> StdLog { StdLog { progress_bar: Mutex::new(Weak::default()), program_name: std::env::current_exe() .unwrap() .file_name() .unwrap() .to_string_lossy() .to_string(), log_stderr_to_stdout: false, no_progress: false, } } /// Clears any previous progress bar or spinner and installs a new spinner. pub fn spinner(&self, msg: &str) -> Arc<ProgressBar> { if self.no_progress { return Arc::new(ProgressBar::new_hidden()); } self.progress_bar .lock() .unwrap() .upgrade() .iter() .for_each(|pb| pb.finish_and_clear()); let result = Arc::new(ProgressBar::new_spinner(msg)); *self.progress_bar.lock().unwrap() = Arc::downgrade(&result); result } /// Clears any previous progress bar or spinner and installs a new progress bar. pub fn progress_bar(&self, msg: &str, len: u64) -> Arc<ProgressBar> { if self.no_progress { return Arc::new(ProgressBar::new_hidden()); } let result = Arc::new(ProgressBar::new_progress_bar(msg, len)); *self.progress_bar.lock().unwrap() = Arc::downgrade(&result); result } /// Creates a no-op progressbar that doesn't display itself. pub fn hidden(&self) -> Arc<ProgressBar> { Arc::new(ProgressBar::new_hidden()) } /// Clears any previous progress bar or spinner and installs a new progress bar. pub fn bytes_progress_bar(&self, msg: &str, len: u64) -> Arc<ProgressBar> { if self.no_progress { return Arc::new(ProgressBar::new_hidden()); } self.progress_bar .lock() .unwrap() .upgrade() .iter() .for_each(|pb| pb.finish_and_clear()); let result = Arc::new(ProgressBar::new_bytes_progress_bar(msg, len)); *self.progress_bar.lock().unwrap() = Arc::downgrade(&result); result } /// Prints a message to stderr. /// Does not interfere with progress bar. 
fn eprintln<I: Display>(&self, msg: I) { match self.progress_bar.lock().unwrap().upgrade() { Some(pb) if pb.is_visible() => pb.eprintln(format!("{msg}")), _ if self.log_stderr_to_stdout => println!("{msg}"), _ => eprintln!("{msg}"), } } const TIMESTAMP_FMT: &'static str = "[%Y-%m-%d %H:%M:%S.%3f]"; } impl Log for StdLog { fn progress_bar(&self, msg: &str, len: ProgressBarLength) -> Arc<dyn ProgressTracker> { match len { ProgressBarLength::Items(count) => self.progress_bar(msg, count), ProgressBarLength::Bytes(count) => self.bytes_progress_bar(msg, count), ProgressBarLength::Unknown => self.spinner(msg), } } fn log(&self, level: LogLevel, msg: String) { let timestamp = Local::now(); let level = match level { LogLevel::Info => style(" info:").for_stderr().green(), LogLevel::Warn => style("warn:").for_stderr().yellow(), LogLevel::Error => style("error:").for_stderr().red(), }; let msg = format!( "{} {}: {} {}", style(timestamp.format(Self::TIMESTAMP_FMT)) .for_stderr() .dim() .white(), style(&self.program_name).for_stderr().yellow(), level, msg ); self.eprintln(msg); } } impl Default for StdLog { fn default() -> Self { StdLog::new() } } 07070100000017000081A4000000000000000000000001653E86C200002BA2000000000000000000000000000000000000002300000000fclones-0.34.0/fclones/src/main.rsuse std::collections::HashMap; use std::ffi::{OsStr, OsString}; use std::fs::File; use std::io::{stdin, Write}; use std::process::exit; use std::sync::Arc; use std::{fs, io}; use clap::Parser; use console::style; use fallible_iterator::FallibleIterator; use itertools::Itertools; use regex::Regex; use fclones::config::{Command, Config, DedupeConfig, GroupConfig, Parallelism}; use fclones::log::{Log, LogExt, ProgressBarLength, StdLog}; use fclones::progress::{NoProgressBar, ProgressTracker}; use fclones::report::{open_report, ReportHeader}; use fclones::{dedupe, log_script, run_script, DedupeOp}; use fclones::{group_files, write_report, Error}; /// Strips a red "error:" prefix and usage information added by clap. /// Removes ansi formatting. /// Joins all lines into a single line. fn extract_error_cause(message: &str) -> String { let drop_ansi = Regex::new(r"\x1b\[[0-9;]*m").unwrap(); let drop_error = Regex::new("error:[^ ]* ").unwrap(); let message = drop_ansi.replace_all(message, ""); let message = drop_error.replace(&message, ""); message .split('\n') .take_while(|l| !l.starts_with("USAGE:")) .map(|l| l.trim()) .filter(|l| !l.is_empty()) .join(" ") } /// Returns error if any of the input paths doesn't exist or if input paths list is empty. fn check_input_paths_exist(config: &GroupConfig, log: &dyn Log) -> Result<(), Error> { // Unfortunately we can't fail fast here when the list of files // is streamed from the standard input, because we'd have to collect all paths into a vector // list first, but we don't want to do this because there may be many. // In that case, we just let the lower layers handle eventual // problems and report as warnings. if config.stdin { return Ok(()); } // If files aren't streamed on stdin, we can inspect all of them now // and exit early on any access error. If depth is set to 0 (recursive scan disabled) // we also want to filter out directories and terminate with an error if there are // no files in the input. 
let mut access_error = false; let depth = config.depth; let input_paths = config .input_paths() .filter(|p| match fs::metadata(p.to_path_buf()) { Ok(m) if m.is_dir() && depth == Some(0) => { log.warn(format!( "Skipping directory {} because recursive scan is disabled.", p.display() )); false } Err(e) => { log.err(format!("Can't access {}: {}", p.display(), e)); access_error = true; false } Ok(_) => true, }) .collect_vec(); if access_error { return Err(Error::from("Some input paths could not be accessed.")); } if input_paths.is_empty() { return Err(Error::from("No input files.")); } Ok(()) } /// Attempts to create the output file and returns an error if it fails. fn check_can_create_output_file(config: &GroupConfig) -> Result<(), Error> { if let Some(output) = &config.output { if let Err(e) = File::create(output) { return Err(Error::new(format!( "Cannot create output file {}: {}", output.display(), e ))); } } Ok(()) } /// Configures global thread pool to use desired number of threads fn configure_main_thread_pool(pool_sizes: &HashMap<OsString, Parallelism>) { let parallelism = pool_sizes.get(OsStr::new("main")).unwrap_or_else(|| { pool_sizes .get(OsStr::new("default")) .unwrap_or(&Parallelism { sequential: 0, random: 0, }) }); rayon::ThreadPoolBuilder::new() .num_threads(parallelism.random) .build_global() .unwrap(); } fn run_group(mut config: GroupConfig, log: &dyn Log) -> Result<(), Error> { config.resolve_base_dir().map_err(|e| e.to_string())?; check_input_paths_exist(&config, log)?; check_can_create_output_file(&config)?; configure_main_thread_pool(&config.thread_pool_sizes()); log.info("Started grouping"); let results = group_files(&config, log).map_err(|e| Error::new(e.message))?; write_report(&config, log, &results) .map_err(|e| Error::new(format!("Failed to write report: {e}"))) } /// Depending on the `output` configuration field, returns either a reference to the standard /// output or a file opened for writing. /// Reports error if the output file cannot be created. fn get_output_writer(config: &DedupeConfig) -> Result<Box<dyn Write + Send>, Error> { match &config.output { Some(path) => { let f = File::create(path) .map_err(|e| format!("Failed to create output file {}: {}", path.display(), e))?; Ok(Box::new(f)) } None => Ok(Box::new(io::stdout())), } } /// Returns the configuration of a previously executed fclones command, /// stored in the report header. fn get_command_config(header: &ReportHeader) -> Result<Config, Error> { let mut command: Config = Config::try_parse_from(&header.command).map_err(|e| { let message: String = extract_error_cause(&e.to_string()); format!("Unrecognized earlier fclones configuration: {message}") })?; // Configure the same base directory as set when running the previous command. // This is important to get the correct input paths. 
if let Command::Group(ref mut group_config) = command.command { group_config.base_dir = header.base_dir.clone(); } Ok(command) } pub fn run_dedupe(op: DedupeOp, config: DedupeConfig, log: &dyn Log) -> Result<(), Error> { let input_error = |e: io::Error| format!("Input error: {e}"); let mut dedupe_config = config; let mut reader = open_report(stdin()).map_err(input_error)?; let header = reader.read_header().map_err(input_error)?; let prev_command_config = get_command_config(&header)?; if let Command::Group(c) = &prev_command_config.command { // we cannot check size if a transformation was applied, because the transformation // may change the size of the data and the recorded data size // would not match the physical size of the file dedupe_config.no_check_size |= c.transform.is_some(); dedupe_config.match_links |= c.match_links; if dedupe_config.rf_over.is_none() { dedupe_config.rf_over = Some(c.rf_over()) } if dedupe_config.isolated_roots.is_empty() && c.isolate { dedupe_config.isolated_roots = c.input_paths().collect(); } } if dedupe_config.rf_over.is_none() { return Err(Error::from( "Could not extract --rf-over setting from the earlier fclones configuration. \ Please set --rf-over explicitly.", )); }; if dedupe_config.modified_before.is_none() { dedupe_config.modified_before = Some(header.timestamp); } if dedupe_config.dry_run { log.info("Started deduplicating (dry run)"); } else { log.info("Started deduplicating"); } let mut result: Result<(), io::Error> = Ok(()); let group_count = header.stats.map(|s| s.group_count as u64); let progress: Arc<dyn ProgressTracker> = match group_count { _ if dedupe_config.dry_run && dedupe_config.output.is_none() => Arc::new(NoProgressBar), Some(group_count) => { log.progress_bar("Deduplicating", ProgressBarLength::Items(group_count)) } None => log.progress_bar("Deduplicating", ProgressBarLength::Unknown), }; let groups = reader.read_groups(); let groups = groups .map_err(input_error)? .iterator() .map(|g| match g { Ok(g) => Some(g), Err(e) => { result = Err(e); None } }) .take_while(|g| g.is_some()) .map(|g| g.unwrap()) .inspect(|_| progress.inc(1)); let upto = if op == DedupeOp::RefLink { // Can't be sure because any previous deduplications are not // visible without calling fs-specific tooling. 
"up to " } else { "" }; let script = dedupe(groups, op, &dedupe_config, log); if dedupe_config.dry_run { let out = get_output_writer(&dedupe_config)?; let result = log_script(script, out).map_err(|e| format!("Output error: {e}"))?; log.info(format!( "Would process {} files and reclaim {}{} space", result.processed_count, upto, result.reclaimed_space )); } else { let result = run_script(script, !dedupe_config.no_lock, log); log.info(format!( "Processed {} files and reclaimed {}{} space", result.processed_count, upto, result.reclaimed_space )); }; result.map_err(|e| Error::new(format!("Failed to read file list: {e}"))) } fn main() { let config: Config = Config::parse(); if let Err(e) = config.command.validate() { eprintln!("{} {}", style("error:").for_stderr().bold().red(), e); exit(1); } let mut log = StdLog::new(); if config.quiet { log.no_progress = true; } let cwd = match std::env::current_dir() { Ok(cwd) => cwd, Err(e) => { log.err(format!("Cannot determine current working directory: {e}")); exit(1); } }; let result = match config.command { Command::Group(config) => run_group(config, &log), Command::Remove(config) => run_dedupe(DedupeOp::Remove, config, &log), Command::Link { config, soft: true } => run_dedupe(DedupeOp::SymbolicLink, config, &log), Command::Link { config, soft: false, } => run_dedupe(DedupeOp::HardLink, config, &log), Command::Dedupe { config, .. } => { if cfg!(windows) { log.err("Command \"dedupe\" is unsupported on Windows"); exit(1); } run_dedupe(DedupeOp::RefLink, config, &log) } Command::Move { config, target } => { let target = fclones::Path::from(target); let target = Arc::new(fclones::Path::from(cwd)).resolve(target); run_dedupe(DedupeOp::Move(Arc::new(target)), config, &log) } }; if let Err(e) = result { if !e.message.is_empty() { log.err(e); } exit(1); } } #[cfg(test)] mod test { #[test] fn test_extract_error_cause_strips_error_prefix() { assert_eq!(super::extract_error_cause("error: foo"), "foo"); } #[test] fn test_extract_error_cause_joins_lines() { assert_eq!( super::extract_error_cause("line1:\n line2"), "line1: line2" ); } #[test] fn test_extract_error_cause_strips_usage() { assert_eq!( super::extract_error_cause("error message\n\nUSAGE:\n blah blah blah"), "error message" ); } } 07070100000018000081A4000000000000000000000001653E86C200003FBD000000000000000000000000000000000000002300000000fclones-0.34.0/fclones/src/path.rs//! Memory-efficient file path representation. use std::ffi::{CStr, CString, OsString}; use std::fmt; use std::hash::Hash; use std::path::{Component, PathBuf}; use std::sync::Arc; use metrohash::MetroHash128; use nom::lib::std::fmt::Formatter; use serde::de::{Error, Visitor}; use serde::{Deserialize, Deserializer, Serialize, Serializer}; use smallvec::SmallVec; use stfu8::DecodeError; use crate::arg; use crate::arg::{from_stfu8, to_stfu8}; use crate::path::string::{c_to_os_str, os_to_c_str}; #[cfg(unix)] pub const PATH_ESCAPE_CHAR: &str = "\\"; #[cfg(windows)] pub const PATH_ESCAPE_CHAR: &str = "^"; #[cfg(unix)] const ROOT_BYTES: &[u8] = b"/"; #[cfg(windows)] const ROOT_BYTES: &[u8] = b"\\"; /// Memory-efficient file path representation. /// /// When storing multiple paths with common parent, the standard [`PathBuf`] /// would keep the parent path text duplicated in memory, wasting a lot of memory. /// This structure here shares the common parent between many paths by reference-counted /// references. 
#[derive(Clone, Hash, PartialEq, Eq, PartialOrd, Ord)] pub struct Path { parent: Option<Arc<Path>>, component: CString, } impl Path { pub fn canonicalize(&self) -> Path { let path_buf = self.to_path_buf(); match dunce::canonicalize(path_buf.clone()) { Ok(p) => Path::from(p), Err(_) => Path::from(path_buf), } } pub fn is_absolute(&self) -> bool { self.root().is_some() } pub fn is_relative(&self) -> bool { self.root().is_none() } /// Returns the absolute root of the path if the path is absolute. /// In Unix, returns "/". /// In Windows this can return a root with prefix e.g. "C:\". /// If path is relative, returns None. pub fn root(&self) -> Option<&Path> { let mut result = self; loop { if result.component.as_bytes() == ROOT_BYTES { return Some(result); } if let Some(parent) = &result.parent { result = parent.as_ref() } else { break; } } None } /// Moves this [`Path`] under an [`Arc`]. /// You need to wrap [`Path`] before joining anything to it. pub fn share(self) -> Arc<Self> { Arc::new(self) } /// Copies this path from under an [`Arc`]. /// Generally cheap, because only the last component is copied. pub fn unshare(self: &Arc<Path>) -> Path { self.as_ref().clone() } /// Creates an owned [`Path`] with `path` adjoined to `self`. /// The `path` must be relative. pub fn join<P: AsRef<Path>>(self: &Arc<Path>, path: P) -> Path { let path = path.as_ref(); assert!(path.is_relative()); let components = path.components(); let mut iter = components.iter(); let mut result = self.push(CString::from(*iter.next().unwrap())); for &c in iter { result = Arc::new(result).push(CString::from(c)); } result } /// If `path` is relative, works the same as [`join`](Path::join). /// If `path` is absolute, ignores `self` and returns `path`. pub fn resolve<P: AsRef<Path>>(self: &Arc<Path>, path: P) -> Path { let path = path.as_ref(); if path.is_relative() { self.join(path) } else { path.clone() } } /// Returns the name of the last component of this path or None /// if the path is directory (e.g. root dir or parent dir). pub fn file_name(&self) -> Option<OsString> { match self.component.as_bytes() { b"/" => None, b".." => None, b"." => None, _ => Some(c_to_os_str(self.component.as_c_str())), } } /// Returns the name of the last component of this path or None /// if the path is directory (e.g. root dir or parent dir). /// Doesn't allocate anything on the heap. pub fn file_name_cstr(&self) -> Option<&CStr> { match self.component.as_bytes() { b"/" => None, b".." => None, b"." => None, _ => Some(self.component.as_c_str()), } } /// Returns the parent directory of this path. /// Doesn't allocate anything on the heap. pub fn parent(&self) -> Option<&Arc<Path>> { self.parent.as_ref() } /// Returns a path that joined to `base` would give this path. /// If base is the same as this path, returns current directory. /// If this path doesn't have a `base` prefix, returns `None`. pub fn strip_prefix(&self, base: &Path) -> Option<Path> { let mut self_components = self.components().into_iter().peekable(); let mut base_components = base.components().into_iter().peekable(); while let (Some(a), Some(b)) = (self_components.peek(), base_components.peek()) { if a != b { return None; } self_components.next(); base_components.next(); } Some(Path::make(self_components)) } /// If this path is absolute, strips the root component and returns a relative path. /// Otherwise returns a clone of this path. /// E.g. 
`/foo/bar` becomes `foo/bar` pub fn strip_root(&self) -> Path { if let Some(root) = self.root() { self.strip_prefix(root).unwrap() } else { self.clone() } } /// Returns true if self is a prefix of another path pub fn is_prefix_of(&self, other: &Path) -> bool { let mut self_components = self.components().into_iter().peekable(); let mut other_components = other.components().into_iter().peekable(); while let (Some(a), Some(b)) = (self_components.peek(), other_components.peek()) { if a != b { return false; } self_components.next(); other_components.next(); } self_components.peek().is_none() } /// Converts this path to a standard library path buffer. /// We need this to be able to use this path with other standard library I/O functions. pub fn to_path_buf(&self) -> PathBuf { let mut result = PathBuf::from(OsString::with_capacity(self.capacity())); self.for_each_component(|c| result.push(c_to_os_str(c))); result } /// Converts this path to an UTF encoded string. /// Any non-Unicode sequences are replaced with /// [`U+FFFD REPLACEMENT CHARACTER`][U+FFFD]. pub fn to_string_lossy(&self) -> String { self.to_path_buf().to_string_lossy().to_string() } /// Returns a lossless string representation in [STFU8 format](https://crates.io/crates/stfu8). pub fn to_escaped_string(&self) -> String { to_stfu8(self.to_path_buf().into_os_string()) } /// Decodes the path from the string encoded with [`to_escaped_string`](Path::to_escaped_string). pub fn from_escaped_string(encoded: &str) -> Result<Path, DecodeError> { Ok(Path::from(from_stfu8(encoded)?)) } /// Formats the path in a way that Posix-shell can decode it. /// If the path doesn't contain any special characters, returns it as-is. /// If the path contains special shell characters like '\\' or '*', it is single-quoted. /// This function also takes care of the characters that cannot be represented in UTF-8 /// by escaping them with `$'\xXX'` or `$'\uXXXX'` syntax. pub fn quote(&self) -> String { arg::quote(self.to_path_buf().into_os_string()) } /// Returns a representation suitable for display in the console. /// Control characters like newline or linefeed are escaped. pub fn display(&self) -> String { self.quote() } /// Returns a hash of the full path. Useful for deduplicating paths without making path clones. /// We need 128-bits so that collisions are not a problem. /// Thanks to using a long hash we can be sure collisions won't be a problem. 
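///
/// A minimal usage sketch (illustrative, not from the original sources; marked `ignore`
/// because it relies on the crate-internal `Path` type):
///
/// ```ignore
/// use std::collections::HashSet;
/// let mut seen: HashSet<u128> = HashSet::new();
/// // The 128-bit hash stands in for the full path, so no path clones are needed:
/// assert!(seen.insert(Path::from("/foo/bar").hash128()));
/// assert!(!seen.insert(Path::from("/foo/bar").hash128()));
/// ```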
pub fn hash128(&self) -> u128 { let mut hasher = MetroHash128::new(); self.hash(&mut hasher); let (a, b) = hasher.finish128(); (a as u128) << 64 | (b as u128) } fn new(component: CString) -> Path { Path { component, parent: None, } } fn push(self: &Arc<Path>, component: CString) -> Path { Path { component, parent: Some(self.clone()), } } /// Flattens this path to a vector of strings fn components(&self) -> SmallVec<[&CStr; 16]> { let mut result = match &self.parent { Some(p) => p.components(), None => SmallVec::new(), }; result.push(&self.component); result } /// Returns the number of components in this path pub fn component_count(&self) -> usize { let mut count = 0; self.for_each_component(|_| count += 1); count } /// Executes a function for each component, left to right fn for_each_component<F: FnMut(&CStr)>(&self, mut f: F) { self.for_each_component_ref(&mut f) } /// Executes a function for each component, left to right fn for_each_component_ref<F: FnMut(&CStr)>(&self, f: &mut F) { self.parent.iter().for_each(|p| p.for_each_component_ref(f)); (f)(self.component.as_c_str()) } /// Estimates size of this path in bytes fn capacity(&self) -> usize { let mut result: usize = 0; self.for_each_component(|c| result += c.to_bytes().len() + 1); result } /// Builds a path from individual string components. /// If the component list is empty, returns a path pointing to the current directory ("."). fn make<'a, I>(components: I) -> Path where I: IntoIterator<Item = &'a CStr> + 'a, { let mut iter = components.into_iter(); let first = iter.next(); let mut result: Path = match first { None => Path::new(CString::new(".").unwrap()), Some(c) => Path::new(CString::from(c)), }; for c in iter { result = Arc::new(result).push(CString::from(c)) } result } } impl AsRef<Path> for Path { fn as_ref(&self) -> &Path { self } } impl Default for Path { fn default() -> Self { Path::from(".") } } /// Converts std path Component to a new CString fn component_to_c_string(c: &Component<'_>) -> CString { os_to_c_str(c.as_os_str()) } impl<P> From<P> for Path where P: AsRef<std::path::Path>, { fn from(p: P) -> Self { let p = p.as_ref(); let mut components = p.components(); let mut result = Path::new(component_to_c_string( &components.next().unwrap_or(Component::CurDir), )); for c in components { result = Arc::new(result).push(component_to_c_string(&c)) } result } } impl Serialize for Path { fn serialize<S>(&self, serializer: S) -> Result<<S as Serializer>::Ok, <S as Serializer>::Error> where S: Serializer, { serializer.collect_str(self.to_escaped_string().as_str()) } } struct PathVisitor; impl Visitor<'_> for PathVisitor { type Value = Path; fn expecting(&self, formatter: &mut Formatter<'_>) -> fmt::Result { formatter.write_str("path string") } fn visit_str<E>(self, v: &str) -> Result<Self::Value, E> where E: Error, { Path::from_escaped_string(v).map_err(|e| E::custom(format!("Invalid path: {e}"))) } } impl<'de> Deserialize<'de> for Path { fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> where D: Deserializer<'de>, { deserializer.deserialize_str(PathVisitor) } } impl fmt::Debug for Path { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { write!(f, "{:?}", self.to_path_buf()) } } mod string { use std::ffi::{CStr, CString, OsStr, OsString}; #[cfg(unix)] pub fn c_to_os_str(str: &CStr) -> OsString { use std::os::unix::ffi::OsStrExt; OsStr::from_bytes(str.to_bytes()).to_os_string() } #[cfg(unix)] pub fn os_to_c_str(str: &OsStr) -> CString { use std::os::unix::ffi::OsStrExt; CString::new(str.as_bytes()).unwrap() } 
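// On Windows the conversions below go through UTF-8 (`to_str().unwrap()`),
// so they panic on paths that are not valid Unicode.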
#[cfg(windows)] pub fn c_to_os_str(str: &CStr) -> OsString { OsString::from(str.to_str().unwrap()) } #[cfg(windows)] pub fn os_to_c_str(str: &OsStr) -> CString { CString::new(str.to_str().unwrap().as_bytes()).unwrap() } } #[cfg(test)] mod test { use super::*; use serde_test::{assert_ser_tokens, Token}; fn test_convert(s: &str) { assert_eq!(PathBuf::from(s), Path::from(s).to_path_buf()); } #[test] fn convert() { test_convert("/"); test_convert("/bar"); test_convert("/foo/bar"); test_convert("."); test_convert("./foo/bar"); test_convert("../foo/bar"); test_convert(".."); test_convert("foo"); test_convert("foo/bar/baz"); test_convert("foo/bar/baz"); } #[test] fn file_name() { assert_eq!( Path::from("foo").file_name_cstr(), Some(CString::new("foo").unwrap().as_c_str()) ); assert_eq!( Path::from("foo/bar").file_name_cstr(), Some(CString::new("bar").unwrap().as_c_str()) ); assert_eq!( Path::from("/foo").file_name_cstr(), Some(CString::new("foo").unwrap().as_c_str()) ); assert_eq!( Path::from("/foo/bar").file_name_cstr(), Some(CString::new("bar").unwrap().as_c_str()) ); assert_eq!(Path::from("/").file_name_cstr(), None); assert_eq!(Path::from(".").file_name_cstr(), None); assert_eq!(Path::from("..").file_name_cstr(), None); } #[test] fn parent() { assert_eq!( Path::from("foo/bar").parent(), Some(&Arc::new(Path::from("foo"))) ); assert_eq!( Path::from("/foo").parent(), Some(&Arc::new(Path::from("/"))) ); assert_eq!(Path::from("/").parent(), None); } #[test] fn share_parents() { let parent = Path::from("/parent").share(); let child1 = parent.join(Path::from("c1")); let child2 = parent.join(Path::from("c2")); assert_eq!(PathBuf::from("/parent/c1"), child1.to_path_buf()); assert_eq!(PathBuf::from("/parent/c2"), child2.to_path_buf()); } #[test] fn is_absolute() { assert!(Path::from("/foo/bar").is_absolute()); assert!(!Path::from("foo/bar").is_absolute()); assert!(!Path::from("./foo/bar").is_absolute()); assert!(!Path::from("../foo/bar").is_absolute()); } #[test] fn strip_prefix() { assert_eq!( Path::from("/foo/bar").strip_prefix(&Path::from("/foo")), Some(Path::from("bar")) ); assert_eq!( Path::from("/foo/bar").strip_prefix(&Path::from("/foo/bar")), Some(Path::from(".")) ); assert_eq!( Path::from("/foo/bar").strip_prefix(&Path::from("/bar")), None ); } #[test] fn is_prefix_of() { assert!(Path::from("/foo/bar").is_prefix_of(&Path::from("/foo/bar"))); assert!(Path::from("/foo/bar").is_prefix_of(&Path::from("/foo/bar/baz"))); assert!(!Path::from("/foo/bar").is_prefix_of(&Path::from("/foo"))) } #[test] fn encode_decode_stfu8() { fn roundtrip(s: &str) { assert_eq!(Path::from_escaped_string(s).unwrap().to_escaped_string(), s) } roundtrip("a/b/c"); roundtrip("Ä…/Å›/ć"); roundtrip("a \\n b"); roundtrip("a \\t b"); roundtrip("a \\x7F b"); } #[test] fn root() { assert!(Path::from("foo/bar").root().is_none()); assert_eq!(Path::from("/foo/bar").root().unwrap(), &Path::from("/")); assert_eq!(Path::from("/foo/bar").strip_root(), Path::from("foo/bar")); } #[test] fn serialize() { assert_ser_tokens(&Path::from("a \n b"), &[Token::String("a \\n b")]) } } 07070100000019000081A4000000000000000000000001653E86C2000043BB000000000000000000000000000000000000002600000000fclones-0.34.0/fclones/src/pattern.rsuse std::fmt::{Display, Formatter}; use std::ops::Add; use std::path::{Path, MAIN_SEPARATOR}; use nom::branch::alt; use nom::bytes::complete::tag; use nom::character::complete::{anychar, none_of}; use nom::combinator::{cond, map}; use nom::multi::{many0, separated_list0}; use nom::sequence::tuple; use nom::IResult; use 
regex::escape; use crate::path::PATH_ESCAPE_CHAR; use crate::regex::Regex; use std::str::FromStr; #[derive(Debug)] pub struct PatternError { pub cause: String, pub input: String, } impl Display for PatternError { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!( f, "Failed to compile pattern '{}': {}", self.input, self.cause ) } } impl std::error::Error for PatternError {} /// Pattern for matching paths and file names. /// Can be constructed from a glob pattern or a raw regular expression. #[derive(Clone, Debug)] pub struct Pattern { src: String, anchored_regex: Regex, prefix_regex: Regex, } impl FromStr for Pattern { type Err = PatternError; fn from_str(s: &str) -> Result<Self, Self::Err> { Pattern::glob(s) } } #[derive(Default)] pub struct PatternOpts { case_insensitive: bool, } impl PatternOpts { pub fn case_insensitive() -> PatternOpts { PatternOpts { case_insensitive: true, } } } #[derive(PartialEq, Debug)] enum Scope { TopLevel, CurlyBrackets, RoundBrackets, } impl Pattern { /// Creates `Pattern` instance from raw regular expression. Supports PCRE syntax. pub fn regex(pattern: &str) -> Result<Pattern, PatternError> { Self::regex_with(pattern, &PatternOpts::default()) } /// Creates `Pattern` instance from raw regular expression. Supports PCRE syntax. /// Allows to specify case sensitivity pub fn regex_with(pattern: &str, opts: &PatternOpts) -> Result<Pattern, PatternError> { let pattern = pattern.trim_start_matches('^'); let pattern = pattern.trim_end_matches('$'); let pattern = pattern.to_string(); let anchored_regex = "^".to_string() + &pattern + "$"; let anchored_regex = Regex::new(anchored_regex.as_str(), opts.case_insensitive); let prefix_regex = "^".to_string() + &pattern; let prefix_regex = Regex::new(prefix_regex.as_str(), opts.case_insensitive); match anchored_regex { Ok(anchored_regex) => Ok(Pattern { src: pattern, anchored_regex, prefix_regex: prefix_regex.unwrap(), }), Err(e) => Err(PatternError { input: pattern, cause: e.to_string(), }), } } /// Creates a `Pattern` that matches literal string. Case insensitive. /// Special characters in the string are escaped before creating the underlying regex. pub fn literal(s: &str) -> Pattern { Self::regex(escape(s).as_str()).unwrap() } /// Creates `Pattern` instance from a Unix extended glob. /// Case insensitive. For syntax reference see [glob_with](glob_with). pub fn glob(pattern: &str) -> Result<Pattern, PatternError> { Self::glob_with(pattern, &PatternOpts::default()) } /// Creates `Pattern` instance from a Unix extended glob. /// /// Glob patterns handle the following wildcards: /// - `?`: matches any character /// - `*`: matches any sequence of characters except the directory separator /// - `**`: matches any sequence of characters /// - `[a-z]`: matches one of the characters or character ranges given in the square brackets /// - `[!a-z]`: matches any character that is not given in the square brackets /// - `{a,b}`: matches exactly one pattern from the comma-separated patterns given inside the curly brackets /// - `@(a|b)`: same as `{a,b}` /// - `?(a|b)`: matches at most one occurrence of the pattern inside the brackets /// - `+(a|b)`: matches at least occurrence of the patterns given inside the brackets /// - `*(a|b)`: matches any number of occurrences of the patterns given inside the brackets /// - `!(a|b)`: matches anything that doesn't match any of the patterns given inside the brackets /// /// Use `\` to escape the special symbols that need to be matched literally. E.g. 
`\*` matches /// a single `*` character. /// pub fn glob_with(glob: &str, opts: &PatternOpts) -> Result<Pattern, PatternError> { let result: IResult<&str, String> = Self::glob_to_regex(Scope::TopLevel, glob); match result { Ok((remaining, regex)) if remaining.is_empty() => { Self::regex_with(regex.as_str(), opts) } Ok((remaining, _)) => Err(PatternError { input: glob.to_string(), cause: format!( "Unexpected '{}' at end of input", remaining.chars().next().unwrap() ), }), Err(e) => Err(PatternError { input: glob.to_string(), cause: e.to_string(), }), } } /// Returns true if this pattern fully matches the given path pub fn matches(&self, path: &str) -> bool { self.anchored_regex.is_match(path) } /// Returns true if a prefix of this pattern fully matches the given path pub fn matches_partially(&self, path: &str) -> bool { self.anchored_regex.is_partial_match(path) } /// Returns true if this pattern fully matches a prefix of the given path pub fn matches_prefix(&self, path: &str) -> bool { self.prefix_regex.is_match(path) } /// Returns true if this pattern fully matches given file path pub fn matches_path(&self, path: &Path) -> bool { self.anchored_regex .is_match(path.to_string_lossy().as_ref()) } /// Parses a UNIX glob and converts it to a regular expression fn glob_to_regex(scope: Scope, glob: &str) -> IResult<&str, String> { // pass escaped characters as-is: let p_escaped = map(tuple((tag(PATH_ESCAPE_CHAR), anychar)), |(_, c)| { escape(c.to_string().as_str()) }); fn mk_string(contents: Vec<String>, prefix: &str, sep: &str, suffix: &str) -> String { format!("{}{}{}", prefix, contents.join(sep), suffix) } // { glob1, glob2, ..., globN } -> ( regex1, regex2, ..., regexN ) let p_alt = map( tuple(( tag("{"), separated_list0(tag(","), |g| Self::glob_to_regex(Scope::CurlyBrackets, g)), tag("}"), )), |(_, list, _)| mk_string(list, "(", "|", ")"), ); let p_ext_glob = |s| { map( tuple(( tag("("), separated_list0(tag("|"), |g| Self::glob_to_regex(Scope::RoundBrackets, g)), tag(")"), )), |(_, list, _)| list, )(s) }; let p_ext_optional = map(tuple((tag("?"), p_ext_glob)), |(_, g)| { mk_string(g, "(", "|", ")?") }); let p_ext_many = map(tuple((tag("*"), p_ext_glob)), |(_, g)| { mk_string(g, "(", "|", ")*") }); let p_ext_at_least_once = map(tuple((tag("+"), p_ext_glob)), |(_, g)| { mk_string(g, "(", "|", ")+") }); let p_ext_exactly_once = map(tuple((tag("@"), p_ext_glob)), |(_, g)| { mk_string(g, "(", "|", ")") }); let p_ext_never = map(tuple((tag("!"), p_ext_glob)), |(_, g)| { mk_string(g, "(?!", "|", ")") }); // ** -> .* let p_double_star = map(tag("**"), |_| ".*".to_string()); let escaped_sep = escape(MAIN_SEPARATOR.to_string().as_str()); // * -> [^/]* let p_single_star = |s| map(tag("*"), |_| "[^".to_string() + escaped_sep.as_str() + "]*")(s); // ? -> . 
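// (more precisely, `?` is translated to a single-character class that excludes
// the path separator, as described in the glob syntax above)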
let p_question_mark = |s| map(tag("?"), |_| "[^".to_string() + escaped_sep.as_str() + "]")(s); // [ characters ] -> [ characters ] let p_neg_character_set = map( tuple((tag("[!"), many0(none_of("]")), tag("]"))), |(_, characters, _)| { "[^".to_string() + &characters.into_iter().collect::<String>() + "]" }, ); // [ characters ] -> [ characters ] let p_character_set = map( tuple((tag("["), many0(none_of("]")), tag("]"))), |(_, characters, _)| { "[".to_string() + &characters.into_iter().collect::<String>() + "]" }, ); let p_separator = map(tag("/"), |_| escaped_sep.clone()); // if we are nested, we can't just pass these through without interpretation let p_any_char = map( tuple(( cond(scope == Scope::TopLevel, anychar), cond(scope == Scope::CurlyBrackets, none_of("{,}")), cond(scope == Scope::RoundBrackets, none_of("(|)")), )), |(a, b, c)| escape(a.or(b).or(c).unwrap().to_string().as_str()), ); let p_token = alt(( p_escaped, p_alt, p_ext_optional, p_ext_many, p_ext_at_least_once, p_ext_exactly_once, p_ext_never, p_double_star, p_single_star, p_question_mark, p_neg_character_set, p_character_set, p_separator, p_any_char, )); let mut parse_all = map(many0(p_token), |s| s.join("")); (parse_all)(glob) } } impl Add<Pattern> for Pattern { type Output = Pattern; fn add(self, rhs: Pattern) -> Self::Output { Pattern::regex((self.to_string() + &rhs.to_string()).as_str()).unwrap() } } impl ToString for Pattern { fn to_string(&self) -> String { self.src.clone() } } #[cfg(test)] mod test { use std::path::PathBuf; use super::*; fn glob_to_regex_str(glob: &str) -> String { Pattern::glob(glob).unwrap().to_string() } fn native_dir_sep(str: &str) -> String { str.replace('/', MAIN_SEPARATOR.to_string().as_str()) } #[test] fn empty() { assert_eq!(glob_to_regex_str(""), ""); } #[test] fn output_escaping() { assert_eq!(glob_to_regex_str("foo.jpg"), "foo\\.jpg"); assert_eq!(glob_to_regex_str("foo(bar)"), "foo\\(bar\\)"); } #[test] fn input_escaping() { assert_eq!(glob_to_regex_str("foo\\*"), "foo\\*"); assert_eq!(glob_to_regex_str("foo\\?"), "foo\\?"); assert_eq!(glob_to_regex_str("foo\\{"), "foo\\{"); assert_eq!(glob_to_regex_str("foo\\}"), "foo\\}"); } #[test] fn question_mark() { let p = Pattern::glob("foo???").unwrap(); assert!(p.matches("foo123")); assert!(!p.matches_path(&PathBuf::from("foo").join("23"))); } #[test] fn single_star() { let p = Pattern::glob("foo*").unwrap(); assert!(p.matches("foo123")); assert!(!p.matches(native_dir_sep("foo/bar").as_str())); } #[test] fn double_star() { let p = Pattern::glob("foo/**/bar").unwrap(); assert!(p.matches(native_dir_sep("foo/1/2/bar").as_str())); } #[test] fn character_set() { assert_eq!(glob_to_regex_str("[a-b.*?-]"), "[a-b.*?-]"); assert_eq!(glob_to_regex_str("[!a-b.*?-]"), "[^a-b.*?-]"); } #[test] fn alternatives() { assert_eq!(glob_to_regex_str("{a,b,c}"), "(a|b|c)"); let p = Pattern::glob("{*.jpg,*.JPG}").unwrap(); assert!(p.matches("foo.jpg")); assert!(p.matches("foo.JPG")); } #[test] fn nested_alternatives() { assert_eq!(glob_to_regex_str("{a,{b,c}}"), "(a|(b|c))"); } #[test] fn naked_comma() { assert_eq!(glob_to_regex_str("a,b,c"), "a,b,c"); } #[test] fn naked_bar() { assert_eq!(glob_to_regex_str("a|b|c"), "a\\|b\\|c"); } #[test] fn unbalanced_paren() { // this is how bash interprets unbalanced paren assert_eq!(glob_to_regex_str("{a,b,c"), "\\{a,b,c"); assert_eq!(glob_to_regex_str("a,b,c}"), "a,b,c\\}"); assert_eq!(glob_to_regex_str("{{a,b}"), "\\{(a|b)"); assert_eq!(glob_to_regex_str("{a,b}}"), "(a|b)\\}"); assert_eq!(glob_to_regex_str("{{{a,b}"), 
"\\{\\{(a|b)"); assert_eq!(glob_to_regex_str("{{{a,b}}"), "\\{((a|b))"); } #[test] fn literal() { assert_eq!( Pattern::literal("test*?{}\\").to_string(), "test\\*\\?\\{\\}\\\\" ) } #[test] fn case_insensitive() { let p = Pattern::glob_with("foo", &PatternOpts::case_insensitive()).unwrap(); assert!(p.matches("foo")); assert!(p.matches("Foo")); assert!(p.matches("FOO")); } #[test] fn add() { assert_eq!( (Pattern::literal("/foo/bar/") + Pattern::glob("*").unwrap()).to_string(), Pattern::glob("/foo/bar/*").unwrap().to_string() ) } #[test] fn matches_double_star_prefix() { let g = Pattern::glob("**/b").unwrap(); assert!(g.matches(native_dir_sep("/b").as_str())); assert!(g.matches(native_dir_sep("/a/b").as_str())); } #[test] fn matches_double_star_infix() { let g1 = Pattern::glob("/a/**/c").unwrap(); assert!(g1.matches(native_dir_sep("/a/b1/c").as_str())); assert!(g1.matches(native_dir_sep("/a/b1/b2/c").as_str())); assert!(g1.matches(native_dir_sep("/a/b1/b2/b3/c").as_str())); } #[test] fn ext_glob_optional() { let g = Pattern::glob("/a-?(foo|bar)").unwrap(); assert!(g.matches(native_dir_sep("/a-foo").as_str())); assert!(g.matches(native_dir_sep("/a-bar").as_str())); } #[test] fn ext_glob_many() { let g = Pattern::glob("/a-*(foo|bar)").unwrap(); assert!(g.matches(native_dir_sep("/a-").as_str())); assert!(g.matches(native_dir_sep("/a-foo").as_str())); assert!(g.matches(native_dir_sep("/a-foofoo").as_str())); assert!(g.matches(native_dir_sep("/a-foobar").as_str())); } #[test] fn ext_glob_at_least_one() { let g = Pattern::glob("/a-+(foo|bar)").unwrap(); assert!(!g.matches(native_dir_sep("/a-").as_str())); assert!(g.matches(native_dir_sep("/a-foo").as_str())); assert!(g.matches(native_dir_sep("/a-foofoo").as_str())); assert!(g.matches(native_dir_sep("/a-foobar").as_str())); } #[test] fn ext_glob_nested() { let g = Pattern::glob("/a-@(foo|bar?(baz))").unwrap(); assert!(g.matches(native_dir_sep("/a-foo").as_str())); assert!(g.matches(native_dir_sep("/a-bar").as_str())); assert!(g.matches(native_dir_sep("/a-barbaz").as_str())); assert!(!g.matches(native_dir_sep("/a-foobaz").as_str())); } #[test] fn ext_glob_exactly_one() { let g = Pattern::glob("/a-@(foo|bar)").unwrap(); assert!(!g.matches(native_dir_sep("/a-").as_str())); assert!(g.matches(native_dir_sep("/a-foo").as_str())); assert!(!g.matches(native_dir_sep("/a-foofoo").as_str())); assert!(!g.matches(native_dir_sep("/a-foobar").as_str())); } #[test] fn matches_fully() { let g1 = Pattern::glob("/a/b?/*").unwrap(); assert!(g1.matches(native_dir_sep("/a/b1/c").as_str())); assert!(g1.matches(native_dir_sep("/a/b1/").as_str())); assert!(!g1.matches(native_dir_sep("/a/b1").as_str())); assert!(!g1.matches(native_dir_sep("/a/b/c").as_str())); } #[test] fn matches_partially() { let g1 = Pattern::glob("/a/b/*").unwrap(); assert!(g1.matches_partially(native_dir_sep("/a").as_str())); assert!(g1.matches_partially(native_dir_sep("/a/b").as_str())); assert!(g1.matches_partially(native_dir_sep("/a/b/foo").as_str())); assert!(!g1.matches_partially(native_dir_sep("/b/foo").as_str())); let g2 = Pattern::glob("/a/{b1,b2}/c/*").unwrap(); assert!(g2.matches_partially(native_dir_sep("/a/b1").as_str())); assert!(g2.matches_partially(native_dir_sep("/a/b2").as_str())); assert!(g2.matches_partially(native_dir_sep("/a/b2/c").as_str())); assert!(!g2.matches_partially(native_dir_sep("/b2/c").as_str())); let g3 = Pattern::glob("/a/{b11,b21/b22}/c/*").unwrap(); assert!(g3.matches_partially(native_dir_sep("/a/b11").as_str())); 
assert!(g3.matches_partially(native_dir_sep("/a/b11/c").as_str())); assert!(g3.matches_partially(native_dir_sep("/a/b21").as_str())); assert!(g3.matches_partially(native_dir_sep("/a/b21/b22").as_str())); assert!(g3.matches_partially(native_dir_sep("/a/b21/b22/c").as_str())); } #[test] fn matches_prefix() { let g1 = Pattern::glob("/a/b/*").unwrap(); assert!(g1.matches_prefix(native_dir_sep("/a/b/c").as_str())); assert!(g1.matches_prefix(native_dir_sep("/a/b/z/foo").as_str())); assert!(!g1.matches_prefix(native_dir_sep("/a/c/z/foo").as_str())); } } 0707010000001A000081A4000000000000000000000001653E86C200000704000000000000000000000000000000000000002400000000fclones-0.34.0/fclones/src/phase.rs/// Identifies a phase of work. /// Used for reporting / progress tracking. #[derive(Clone, Copy, Eq, PartialEq)] pub enum Phase { Walk, FetchExtents, GroupBySize, GroupByPrefix, GroupBySuffix, GroupByContents, TransformAndGroup, } impl Phase { pub fn name(&self) -> &'static str { match self { Phase::Walk => "Scanning files", Phase::FetchExtents => "Fetching extents", Phase::GroupBySize => "Grouping by size", Phase::GroupByPrefix => "Grouping by prefix", Phase::GroupBySuffix => "Grouping by suffix", Phase::GroupByContents => "Grouping by contents", Phase::TransformAndGroup => "Transforming and grouping", } } } /// Represents a sequence of phases. /// Used for progress reporting. pub struct Phases(Vec<Phase>); impl Phases { pub fn new(phases: Vec<Phase>) -> Phases { Phases(phases) } /// Returns a string with the sequential number of the phase and its name. /// Panics if the vector does not contain the given phase. pub fn format(&self, phase: Phase) -> String { let phase_no = self.0.iter().position(|p| *p == phase).unwrap(); let phase_count = self.0.len(); format!("{}/{}: {}", phase_no + 1, phase_count, phase.name()) } } #[cfg(test)] mod test { use crate::phase::{Phase, Phases}; #[test] fn format_phase() { let phases = Phases(vec![ Phase::Walk, Phase::GroupBySize, Phase::GroupByPrefix, Phase::GroupBySuffix, Phase::GroupByContents, ]); assert_eq!("1/5: Scanning files", phases.format(Phase::Walk)); assert_eq!("2/5: Grouping by size", phases.format(Phase::GroupBySize)); } } 0707010000001B000081A4000000000000000000000001653E86C2000021A6000000000000000000000000000000000000002700000000fclones-0.34.0/fclones/src/progress.rs//! Fast, concurrent, lockless progress bars. use crate::FileLen; use console::style; use status_line::{Options, StatusLine}; use std::fmt::{Display, Formatter}; use std::sync::atomic::{AtomicU64, Ordering}; use std::time::Instant; /// Common interface for components that can show progress of a task. E.g. progress bars. pub trait ProgressTracker: Sync + Send { fn inc(&self, delta: u64); } /// A progress bar that doesn't display itself and does nothing. /// This exists purely because sometimes there is an operation that needs to report progress, /// but we don't want to show it to the user.
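///
/// A small usage sketch (illustrative only; the caller function is hypothetical):
///
/// ```ignore
/// // Any code written against the trait works with either implementation:
/// fn process_items(tracker: &dyn ProgressTracker) {
///     for _ in 0..10 {
///         tracker.inc(1);
///     }
/// }
/// process_items(&NoProgressBar);                                    // silent
/// process_items(&ProgressBar::new_progress_bar("Processing", 10));  // visible bar
/// ```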
pub struct NoProgressBar; impl ProgressTracker for NoProgressBar { fn inc(&self, _delta: u64) {} } #[derive(Debug, Default)] enum ProgressUnit { #[default] Item, Bytes, } /// Keeps state of the progress bar and controls how it is rendered to a string #[derive(Debug)] struct Progress { msg: String, // message shown before the progress bar value: AtomicU64, // controls the length of the progress bar max: Option<u64>, // maximum expected value, if not set an animated spinner is shown unit: ProgressUnit, // how to format the numbers start_time: Instant, // needed for the animation color: bool, } impl Progress { fn fmt_value(&self, value: u64) -> String { match self.unit { ProgressUnit::Item => value.to_string(), ProgressUnit::Bytes => FileLen(value).to_string(), } } /// Draws the progress bar alone (without message and numbers) fn bar(&self, length: usize) -> String { let mut bar = "=".repeat(length); if !bar.is_empty() { bar.pop(); bar.push('>'); } bar.truncate(MAX_BAR_LEN); bar } fn animate_spinner(&self, frame: u64) -> String { let spaceship = "<===>"; let max_pos = (MAX_BAR_LEN - spaceship.len()) as u64; let pos = ((frame + max_pos) % (max_pos * 2)).abs_diff(max_pos); assert!(pos < MAX_BAR_LEN as u64); " ".repeat(pos as usize) + spaceship } } impl Default for Progress { fn default() -> Self { Progress { msg: "".to_owned(), value: AtomicU64::default(), max: None, unit: ProgressUnit::default(), start_time: Instant::now(), color: true, } } } const MAX_BAR_LEN: usize = 50; impl Display for Progress { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { let value = self.value.load(Ordering::Relaxed); let value_str = self.fmt_value(value); let msg = if self.color { style(self.msg.clone()).for_stderr().cyan().bold() } else { style(self.msg.clone()) }; match self.max { Some(max) => { let max_str = self.fmt_value(max); let bar_len = (MAX_BAR_LEN as u64 * value / max.max(1)) as usize; let bar = self.bar(bar_len); write!(f, "{msg:32}[{bar:MAX_BAR_LEN$}]{value_str:>14} / {max_str}") } None => { let frame = (self.start_time.elapsed().as_millis() / 50) as u64; let bar = self.animate_spinner(frame); write!(f, "{msg:32}[{bar:MAX_BAR_LEN$}]{value_str:>14}") } } } } /// Console-based progress bar that renders to standard error. pub struct ProgressBar { status_line: StatusLine<Progress>, } impl ProgressBar { /// Create a new preconfigured animated spinner with given message. pub fn new_spinner(msg: &str) -> ProgressBar { let progress = Progress { msg: msg.to_string(), ..Default::default() }; ProgressBar { status_line: StatusLine::new(progress), } } /// Create a new preconfigured progress bar with given message. pub fn new_progress_bar(msg: &str, len: u64) -> ProgressBar { let progress = Progress { msg: msg.to_string(), max: Some(len), ..Default::default() }; ProgressBar { status_line: StatusLine::new(progress), } } /// Create a new preconfigured progress bar with given message. /// Displays progress in bytes. pub fn new_bytes_progress_bar(msg: &str, len: u64) -> ProgressBar { let progress = Progress { msg: msg.to_string(), max: Some(len), unit: ProgressUnit::Bytes, ..Default::default() }; ProgressBar { status_line: StatusLine::new(progress), } } /// Creates a new invisible progress bar. /// This is useful when you need to disable progress bar, but you need to pass an instance /// of a `ProgressBar` to something that expects it. 
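///
/// Sketch (illustrative; `quiet` and `total` are hypothetical variables):
///
/// ```ignore
/// let bar = if quiet {
///     ProgressBar::new_hidden()
/// } else {
///     ProgressBar::new_progress_bar("Copying", total)
/// };
/// bar.tick(); // same API either way; the hidden bar just never draws
/// ```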
pub fn new_hidden() -> ProgressBar { ProgressBar { status_line: StatusLine::with_options( Progress::default(), Options { refresh_period: Default::default(), initially_visible: false, enable_ansi_escapes: false, }, ), } } pub fn is_visible(&self) -> bool { self.status_line.is_visible() } pub fn eprintln<I: AsRef<str>>(&self, msg: I) { let was_visible = self.status_line.is_visible(); self.status_line.set_visible(false); eprintln!("{}", msg.as_ref()); self.status_line.set_visible(was_visible); } pub fn tick(&self) { self.status_line.value.fetch_add(1, Ordering::Relaxed); } pub fn finish_and_clear(&self) { self.status_line.set_visible(false); } } impl ProgressTracker for ProgressBar { fn inc(&self, delta: u64) { self.status_line.value.fetch_add(delta, Ordering::Relaxed); } } #[cfg(test)] mod test { use crate::progress::{Progress, ProgressUnit}; use crate::regex::Regex; use std::sync::atomic::{AtomicU64, Ordering}; #[test] fn draw_progress_bar() { let p = Progress { msg: "Message".to_string(), max: Some(100), color: false, ..Default::default() }; assert_eq!(p.to_string(), "Message [ ] 0 / 100"); p.value.fetch_add(2, Ordering::Relaxed); assert_eq!(p.to_string(), "Message [> ] 2 / 100"); p.value.fetch_add(50, Ordering::Relaxed); assert_eq!(p.to_string(), "Message [=========================> ] 52 / 100"); p.value.fetch_add(48, Ordering::Relaxed); assert_eq!(p.to_string(), "Message [=================================================>] 100 / 100"); } #[test] fn draw_progress_bar_bytes() { let p = Progress { msg: "Message".to_string(), max: Some(1000000000), value: AtomicU64::new(12000), unit: ProgressUnit::Bytes, color: false, ..Default::default() }; assert_eq!(p.to_string(), "Message [ ] 12.0 KB / 1000.0 MB"); } #[test] fn animate_spinner() { let p = Progress { msg: "Message".to_string(), color: false, ..Default::default() }; let pattern = Regex::new( "^Message \\[ *<===> *\\] 0$", false, ) .unwrap(); let s = p.to_string(); assert!( pattern.is_match(s.as_str()), "Spinner doesn't match pattern: {}", s ); assert_eq!(p.animate_spinner(0), "<===>"); assert_eq!(p.animate_spinner(1), " <===>"); assert_eq!(p.animate_spinner(2), " <===>"); assert_eq!(p.animate_spinner(3), " <===>"); assert_eq!(p.animate_spinner(85), " <===>"); assert_eq!(p.animate_spinner(86), " <===>"); assert_eq!(p.animate_spinner(87), " <===>"); assert_eq!(p.animate_spinner(88), " <===>"); assert_eq!(p.animate_spinner(89), " <===>"); assert_eq!(p.animate_spinner(90), "<===>"); assert_eq!(p.animate_spinner(91), " <===>"); assert_eq!(p.animate_spinner(92), " <===>"); } } 0707010000001C000081A4000000000000000000000001653E86C2000043A3000000000000000000000000000000000000002600000000fclones-0.34.0/fclones/src/reflink.rsuse std::fs; use std::fs::Metadata; use std::io; use filetime::FileTime; use crate::dedupe::{FsCommand, PathAndMetadata}; use crate::log::{Log, LogExt}; #[cfg(unix)] struct XAttr { name: std::ffi::OsString, value: Option<Vec<u8>>, } /// Calls OS-specific reflink implementations with an option to call the more generic /// one during testing one on Linux ("crosstesting"). /// The destination file is allowed to exist. 
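///
/// A minimal calling sketch (illustrative; the file paths are hypothetical and the
/// example is marked `ignore` because it needs crate-internal types):
///
/// ```ignore
/// let log = crate::log::StdLog::new();
/// let src = PathAndMetadata::new(crate::path::Path::from("/data/original")).unwrap();
/// let dest = PathAndMetadata::new(crate::path::Path::from("/data/duplicate")).unwrap();
/// // After this call `dest` shares its data blocks with `src` (on a supporting filesystem):
/// reflink(&src, &dest, &log)?;
/// ```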
pub fn reflink(src: &PathAndMetadata, dest: &PathAndMetadata, log: &dyn Log) -> io::Result<()> { // Remember original metadata of the parent directory: let dest_parent = dest.path.parent(); let dest_parent_metadata = dest_parent.map(|p| p.to_path_buf().metadata()); // Call reflink: let result = || -> io::Result<()> { let dest_path_buf = dest.path.to_path_buf(); if cfg!(any(target_os = "linux", target_os = "android")) && !crosstest() { linux_reflink(src, dest, log)?; restore_metadata(&dest_path_buf, &dest.metadata, Restore::TimestampOnly) } else { #[cfg(unix)] let dest_xattrs = get_xattrs(&dest_path_buf)?; safe_reflink(src, dest, log)?; #[cfg(unix)] restore_xattrs(&dest_path_buf, dest_xattrs)?; restore_metadata( &dest_path_buf, &dest.metadata, Restore::TimestampOwnersPermissions, ) } }() .map_err(|e| { io::Error::new( e.kind(), format!("Failed to deduplicate {dest} -> {src}: {e}"), ) }); // Restore the original metadata of the deduplicated files's parent directory: if let Some(parent) = dest_parent { if let Some(metadata) = dest_parent_metadata { let result = metadata.and_then(|metadata| { restore_metadata(&parent.to_path_buf(), &metadata, Restore::TimestampOnly) }); if let Err(e) = result { log.warn(format!( "Failed keep metadata for {}: {}", parent.display(), e )) } } } result } // Dummy function so tests compile #[cfg(not(any(target_os = "linux", target_os = "android")))] fn linux_reflink( _target: &PathAndMetadata, _link: &PathAndMetadata, _log: &dyn Log, ) -> io::Result<()> { unreachable!() } // First reflink (not move) the target file out of the way (this also checks for // reflink support), then overwrite the existing file to preserve most metadata and xattrs. #[cfg(any(target_os = "linux", target_os = "android"))] fn linux_reflink(src: &PathAndMetadata, dest: &PathAndMetadata, log: &dyn Log) -> io::Result<()> { let tmp = FsCommand::temp_file(&dest.path); let std_tmp = tmp.to_path_buf(); let fs_target = src.path.to_path_buf(); let std_link = dest.path.to_path_buf(); let remove_temporary = |temporary| { if let Err(e) = FsCommand::remove(&temporary) { log.warn(format!( "Failed to remove temporary {}: {}", temporary.display(), e )) } }; // Backup via reflink, if this fails then the fs does not support reflinking. if let Err(e) = reflink_overwrite(&std_link, &std_tmp) { remove_temporary(tmp); return Err(e); } match reflink_overwrite(&fs_target, &std_link) { Err(e) => { if let Err(remove_err) = FsCommand::unsafe_rename(&tmp, &dest.path) { log.warn(format!( "Failed to undo deduplication from {} to {}: {}", &dest, tmp.display(), remove_err )) } Err(e) } Ok(ok) => { remove_temporary(tmp); Ok(ok) } } } /// Reflink `target` to `link` and expect these two files to be equally sized. #[cfg(any(target_os = "linux", target_os = "android"))] fn reflink_overwrite(target: &std::path::Path, link: &std::path::Path) -> io::Result<()> { use nix::request_code_write; use std::os::unix::prelude::AsRawFd; let src = fs::File::open(target)?; // This operation does not require `.truncate(true)` because the files are already of the same size. 
let dest = fs::OpenOptions::new().create(true).write(true).open(link)?; // From /usr/include/linux/fs.h: // #define FICLONE _IOW(0x94, 9, int) const FICLONE_TYPE: u8 = 0x94; const FICLONE_NR: u8 = 9; const FICLONE_SIZE: usize = std::mem::size_of::<libc::c_int>(); let ret = unsafe { libc::ioctl( dest.as_raw_fd(), request_code_write!(FICLONE_TYPE, FICLONE_NR, FICLONE_SIZE), src.as_raw_fd(), ) }; #[allow(clippy::if_same_then_else)] if ret == -1 { let err = io::Error::last_os_error(); let code = err.raw_os_error().unwrap(); // unwrap () Ok, created from `last_os_error()` if code == libc::EOPNOTSUPP { // 95 // Filesystem does not supported reflinks. // No cleanup required, file is left untouched. } else if code == libc::EINVAL { // 22 // Source filesize was larger than destination. } Err(err) } else { Ok(()) } } /// Restores file owner and group #[cfg(unix)] fn restore_owner(path: &std::path::Path, metadata: &Metadata) -> io::Result<()> { use file_owner::PathExt; use std::os::unix::fs::MetadataExt; let uid = metadata.uid(); let gid = metadata.gid(); path.set_group(gid).map_err(|e| { io::Error::new( io::ErrorKind::Other, format!("Failed to set file group of {}: {}", path.display(), e), ) })?; path.set_owner(uid).map_err(|e| { io::Error::new( io::ErrorKind::Other, format!("Failed to set file owner of {}: {}", path.display(), e), ) })?; Ok(()) } #[derive(Debug, PartialEq)] enum Restore { TimestampOnly, TimestampOwnersPermissions, } // Not kept: xattrs, ACLs, etc. fn restore_metadata( path: &std::path::Path, metadata: &Metadata, restore: Restore, ) -> io::Result<()> { let atime = FileTime::from_last_access_time(metadata); let mtime = FileTime::from_last_modification_time(metadata); filetime::set_file_times(path, atime, mtime).map_err(|e| { io::Error::new( e.kind(), format!( "Failed to set access and modification times for {}: {}", path.display(), e ), ) })?; if restore == Restore::TimestampOwnersPermissions { fs::set_permissions(path, metadata.permissions()).map_err(|e| { io::Error::new( e.kind(), format!("Failed to set permissions for {}: {}", path.display(), e), ) })?; #[cfg(unix)] restore_owner(path, metadata)?; } Ok(()) } #[cfg(unix)] fn get_xattrs(path: &std::path::Path) -> io::Result<Vec<XAttr>> { use itertools::Itertools; use xattr::FileExt; let file = fs::File::open(path)?; file.list_xattr() .map_err(|e| { io::Error::new( e.kind(), format!( "Failed to list extended attributes of {}: {}", path.display(), e ), ) })? .map(|name| { Ok(XAttr { value: file.get_xattr(name.as_os_str()).map_err(|e| { io::Error::new( e.kind(), format!( "Failed to read extended attribute {} of {}: {}", name.to_string_lossy(), path.display(), e ), ) })?, name, }) }) .try_collect() } #[cfg(unix)] fn restore_xattrs(path: &std::path::Path, xattrs: Vec<XAttr>) -> io::Result<()> { use xattr::FileExt; let file = fs::File::open(path)?; for name in file.list_xattr()? { file.remove_xattr(&name).map_err(|e| { io::Error::new( e.kind(), format!( "Failed to clear extended attribute {} of {}: {}", name.to_string_lossy(), path.display(), e ), ) })?; } for attr in xattrs { if let Some(value) = attr.value { file.set_xattr(&attr.name, &value).map_err(|e| { io::Error::new( e.kind(), format!( "Failed to set extended attribute {} of {}: {}", attr.name.to_string_lossy(), path.display(), e ), ) })?; } } Ok(()) } // Reflink which expects the destination to not exist. 
#[cfg(any(not(any(target_os = "linux", target_os = "android")), test))] fn copy_by_reflink(src: &crate::path::Path, dest: &crate::path::Path) -> io::Result<()> { reflink::reflink(src.to_path_buf(), dest.to_path_buf()) .map_err(|e| io::Error::new(e.kind(), format!("Failed to reflink: {e}"))) } // Create a reflink by removing the file and making a reflink copy of the original. // After successful copy, attempts to restore the metadata of the file. // If reflink or metadata restoration fails, moves the original file back to its original place. #[cfg(any(not(any(target_os = "linux", target_os = "android")), test))] fn safe_reflink(src: &PathAndMetadata, dest: &PathAndMetadata, log: &dyn Log) -> io::Result<()> { FsCommand::safe_remove( &dest.path, move |link| { copy_by_reflink(&src.path, link)?; Ok(()) }, log, ) } // Dummy function so non-test cfg compiles #[cfg(not(any(not(any(target_os = "linux", target_os = "android")), test)))] fn safe_reflink(_src: &PathAndMetadata, _dest: &PathAndMetadata, _log: &dyn Log) -> io::Result<()> { unreachable!() } #[cfg(not(test))] pub const fn crosstest() -> bool { false } #[cfg(test)] pub fn crosstest() -> bool { test::cfg::crosstest() } #[cfg(test)] pub mod test { pub mod cfg { // Helpers to switch reflink implementations when running tests // and to ensure only one reflink test runs at a time. use std::sync::{Mutex, MutexGuard}; use lazy_static::lazy_static; lazy_static! { pub static ref CROSSTEST: Mutex<bool> = Mutex::new(false); pub static ref SEQUENTIAL_REFLINK_TESTS: Mutex<()> = Mutex::default(); } pub struct CrossTest<'a>(MutexGuard<'a, ()>); impl<'a> CrossTest<'a> { pub fn new(crosstest: bool) -> CrossTest<'a> { let x = CrossTest(SEQUENTIAL_REFLINK_TESTS.lock().unwrap()); *CROSSTEST.lock().unwrap() = crosstest; x } } impl<'a> Drop for CrossTest<'a> { fn drop(&mut self) { *CROSSTEST.lock().unwrap() = false; } } pub fn crosstest() -> bool { *CROSSTEST.lock().unwrap() } } use crate::log::StdLog; use std::sync::Arc; use crate::util::test::{cached_reflink_supported, read_file, with_dir, write_file}; use super::*; use crate::path::Path as FcPath; // Usually /dev/shm only exists on Linux. #[cfg(target_os = "linux")] fn test_reflink_command_fails_on_dev_shm_tmpfs() { // No `cached_reflink_supported()` check if !std::path::Path::new("/dev/shm").is_dir() { println!(" Notice: strange Linux without /dev/shm, can't test reflink failure"); return; } let test_root = "/dev/shm/tmp.fclones.reflink.testfailure"; // Usually /dev/shm is mounted as a tmpfs which does not support reflinking, so test there. 
with_dir(test_root, |root| { // Always clean up files in /dev/shm, even after failure struct CleanupGuard<'a>(&'a str); impl<'a> Drop for CleanupGuard<'a> { fn drop(&mut self) { fs::remove_dir_all(self.0).unwrap(); } } let _guard = CleanupGuard(test_root); let log = StdLog::new(); let file_path_1 = root.join("file_1"); let file_path_2 = root.join("file_2"); write_file(&file_path_1, "foo"); write_file(&file_path_2, "foo"); let file_1 = PathAndMetadata::new(FcPath::from(&file_path_1)).unwrap(); let file_2 = PathAndMetadata::new(FcPath::from(&file_path_2)).unwrap(); let cmd = FsCommand::RefLink { target: Arc::new(file_1), link: file_2, }; assert!( cmd.execute(true, &log) .unwrap_err() .to_string() .starts_with("Failed to deduplicate"), "Reflink did not fail on /dev/shm (tmpfs), or this mount now supports reflinking" ); assert!(file_path_2.exists()); assert_eq!(read_file(&file_path_2), "foo"); }) } #[test] #[cfg(target_os = "linux")] fn test_reflink_command_failure() { { let _sequential = cfg::CrossTest::new(false); test_reflink_command_fails_on_dev_shm_tmpfs(); } { let _sequential = cfg::CrossTest::new(true); test_reflink_command_fails_on_dev_shm_tmpfs(); } } fn test_reflink_command_with_file_too_large(via_ioctl: bool) { if !cached_reflink_supported() { return; } with_dir("dedupe/reflink_too_large", |root| { let log = StdLog::new(); let file_path_1 = root.join("file_1"); let file_path_2 = root.join("file_2"); write_file(&file_path_1, "foo"); write_file(&file_path_2, "too large"); let file_1 = PathAndMetadata::new(FcPath::from(&file_path_1)).unwrap(); let file_2 = PathAndMetadata::new(FcPath::from(&file_path_2)).unwrap(); let cmd = FsCommand::RefLink { target: Arc::new(file_1), link: file_2, }; if via_ioctl { assert!(cmd .execute(true, &log) .unwrap_err() .to_string() .starts_with("Failed to deduplicate")); assert!(file_path_1.exists()); assert!(file_path_2.exists()); assert_eq!(read_file(&file_path_1), "foo"); assert_eq!(read_file(&file_path_2), "too large"); } else { cmd.execute(true, &log).unwrap(); assert!(file_path_2.exists()); assert_eq!(read_file(&file_path_2), "foo"); } }) } #[test] fn test_reflink_command_works_with_files_too_large_anyos() { let _sequential = cfg::CrossTest::new(true); test_reflink_command_with_file_too_large(false); } // This tests the reflink code path (using the reflink crate) usually not used on Linux. #[test] #[cfg(any(target_os = "linux", target_os = "android"))] fn test_reflink_command_fails_with_files_too_large_using_ioctl_linux() { let _sequential = cfg::CrossTest::new(false); test_reflink_command_with_file_too_large(true); } fn test_reflink_command_fills_file_with_content() { if !cached_reflink_supported() { return; } with_dir("dedupe/reflink_test", |root| { let log = StdLog::new(); let file_path_1 = root.join("file_1"); let file_path_2 = root.join("file_2"); write_file(&file_path_1, "foo"); write_file(&file_path_2, "f"); let file_1 = PathAndMetadata::new(FcPath::from(&file_path_1)).unwrap(); let file_2 = PathAndMetadata::new(FcPath::from(&file_path_2)).unwrap(); let cmd = FsCommand::RefLink { target: Arc::new(file_1), link: file_2, }; cmd.execute(true, &log).unwrap(); assert!(file_path_1.exists()); assert!(file_path_2.exists()); assert_eq!(read_file(&file_path_2), "foo"); }) } #[test] fn test_reflink_command_fills_file_with_content_anyos() { let _sequential = cfg::CrossTest::new(false); test_reflink_command_fills_file_with_content(); } // This tests the reflink code path (using the reflink crate) usually not used on Linux. 
#[test] #[cfg(any(target_os = "linux", target_os = "android"))] fn test_reflink_command_fills_file_with_content_not_ioctl_linux() { let _sequential = cfg::CrossTest::new(true); test_reflink_command_fills_file_with_content(); } } 0707010000001D000081A4000000000000000000000001653E86C2000012DC000000000000000000000000000000000000002400000000fclones-0.34.0/fclones/src/regex.rsuse std::cmp::min; /// Adds poor-man's partial matching support to the standard regex::Regex /// Note this is very limited and slightly broken stub for partial matching. /// False positives for partial matching are allowed. #[derive(Clone, Debug)] pub struct Regex { regex: regex::Regex, fixed_prefix: String, case_insensitive: bool, } impl Regex { pub fn new(re: &str, case_insensitive: bool) -> Result<Regex, regex::Error> { assert!(re.starts_with('^')); let regex = regex::RegexBuilder::new(re) .case_insensitive(case_insensitive) .build()?; let fixed_prefix = if case_insensitive { Self::get_fixed_prefix(re).to_lowercase() } else { Self::get_fixed_prefix(re) }; Ok(Regex { regex, fixed_prefix, case_insensitive, }) } pub fn is_match(&self, s: &str) -> bool { self.regex.is_match(s) } /// Returns true if given string `s` could match the pattern if extended /// by more characters. /// /// Technically it checks if the string s matches the initial characters /// in the fixed prefix of the regex, where fixed prefix are all characters up to the /// first regex wildcard. pub fn is_partial_match(&self, s: &str) -> bool { let len = min(s.len(), self.fixed_prefix.len()); let truncated: String = s.chars().take(len).collect(); let pattern = if self.case_insensitive { truncated.to_lowercase() } else { truncated }; self.fixed_prefix.starts_with(&pattern) } /// Returns the initial fragment of the regex string that always matches /// a fixed string. That fragment does not contain any wildcard characters (or all are escaped). 
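///
/// Worked examples (illustrative, derived from the rules below): `^foo/BAR` yields
/// `foo/BAR`, `^foo/ba[rz]` stops at the character class and yields `foo/ba`, and
/// `^abcd*ef` yields `abc`, because `*` may make the preceding character optional,
/// so that character is dropped as well.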
fn get_fixed_prefix(s: &str) -> String { let mut escape = false; let mut result = String::new(); let magic_chars = ['.', '^', '$', '(', ')', '{', '}', '[', ']', '|', '.', '+']; for (i, c) in s.chars().enumerate() { if c == '^' && i == 0 { continue; } if magic_chars.contains(&c) && !escape { break; } // these may make the previous character optional, // so we erase the last added one if ['?', '*'].contains(&c) && !escape { result = result.chars().take(result.len() - 1).collect(); break; } // escaped alphabetic character means a character class, // so let's stop here as well if c.is_ascii_alphabetic() && escape { break; } // we\re not adding the escape char to the output, because the output is not a regexp if c == '\\' { escape = true; continue; } result.push(c); } result } } #[cfg(test)] mod test { use super::*; #[test] fn test_simple_text_is_passed_as_is() { let r = Regex::new("^foo/BAR", false).unwrap(); assert!(r.is_match("foo/BAR")); assert!(r.is_partial_match("foo/BAR")); assert!(!r.is_match("foo/bar")); assert!(!r.is_partial_match("foo/bar")); } #[test] fn test_case_insensitive() { let r = Regex::new("^foo/BAR", true).unwrap(); assert!(r.is_match("foo/BAR")); assert!(r.is_partial_match("foo/BAR")); assert!(r.is_match("foo/bar")); assert!(r.is_partial_match("foo/bar")); assert!(!r.is_partial_match("foo/baz")); } #[test] fn test_partial_match_stops_on_wildcard() { let r = Regex::new("^abcd*ef", true).unwrap(); assert!(r.is_match("abcdddddef")); assert!(r.is_match("abcef")); assert!(!r.is_match("abef")); assert!(r.is_partial_match("a")); assert!(r.is_partial_match("ab")); assert!(r.is_partial_match("ab")); assert!(r.is_partial_match("abc")); assert!(r.is_partial_match("abcef")); assert!(!r.is_partial_match("-a")); assert!(!r.is_partial_match("a-b")); assert!(!r.is_partial_match("ab-")); } #[test] fn test_unicode() { let r = Regex::new("^ąęść?", true).unwrap(); assert!(r.is_partial_match("Ä…")); assert!(r.is_partial_match("Ä…Ä™")); assert!(r.is_partial_match("ąęś")); assert!(r.is_partial_match("ąęść")); assert!(!r.is_partial_match("ąęść---")); } #[test] fn test_can_partial_match_escaped_chars() { let r = Regex::new("^\\.\\*", true).unwrap(); assert!(r.is_partial_match(".")); assert!(r.is_partial_match(".*")); assert!(!r.is_partial_match("foo")); } } 0707010000001E000081A4000000000000000000000001653E86C2000083D6000000000000000000000000000000000000002500000000fclones-0.34.0/fclones/src/report.rs//! Output formatting. use std::cell::Cell; use std::cmp::min; use std::io; use std::io::{BufRead, BufReader, Error, ErrorKind, Read, Write}; use std::str::FromStr; use chrono::{DateTime, FixedOffset}; use console::style; use fallible_iterator::FallibleIterator; use itertools::Itertools; use lazy_static::lazy_static; use regex::Regex; use serde::{Deserialize, Serialize}; use crate::arg; use crate::arg::Arg; use crate::config::OutputFormat; use crate::file::{FileHash, FileLen}; use crate::group::FileGroup; use crate::path::Path; use crate::util::IteratorWrapper; use crate::TIMESTAMP_FMT; /// Describes how many redundant files were found, in how many groups, /// how much space can be reclaimed, etc. #[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)] pub struct FileStats { pub group_count: usize, pub total_file_count: usize, pub total_file_size: FileLen, pub redundant_file_count: usize, pub redundant_file_size: FileLen, pub missing_file_count: usize, pub missing_file_size: FileLen, } /// Data in the header of the whole report. 
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)] pub struct ReportHeader { /// The program version that produced the report pub version: String, /// The date and time when the report was produced pub timestamp: DateTime<FixedOffset>, /// Full shell command containing arguments of the search run that produced the report pub command: Vec<Arg>, /// Working directory where the fclones command was executed pub base_dir: Path, /// Information on the number of duplicate files reported. /// This is optional to allow streaming the report out before finding all files in the future. pub stats: Option<FileStats>, } /// A helper struct that allows to serialize the report with serde. /// Together with `IteratorWrapper` used as `groups` it allows to serialize /// a report in a streaming way, without the need to keep all groups in memory at once. #[derive(Serialize)] struct SerializableReport<'a, G: Serialize> { header: &'a ReportHeader, groups: G, } /// A structure for holding contents of the report after fully deserializing the report. /// Used only by report readers that deserialize the whole report at once. /// Paths are represented as strings, because strings are more memory efficient than Path here, /// because we can't do prefix compression that `Path` was designed for. #[derive(Deserialize)] struct DeserializedReport { header: ReportHeader, groups: Vec<FileGroup<String>>, } /// Formats and writes duplicate files report to a stream. /// Supports many formats: text, csv, json, etc. pub struct ReportWriter<W: Write> { out: W, color: bool, } impl<W: Write> ReportWriter<W> { pub fn new(out: W, color: bool) -> ReportWriter<W> { ReportWriter { out, color } } fn write_header_line(&mut self, line: &str) -> io::Result<()> { writeln!( self.out, "{}", style(format!("# {line}")).cyan().force_styling(self.color) ) } /// Writes the report in human-readable text format. /// /// A group of identical files starts with a group header at column 0, /// containing the size and hash of each file in the group. /// Then file paths are printed in separate, indented lines. /// /// # Example /// ```text /// # Report by fclones 0.18.0 /// # Timestamp: 2022-03-18 08:22:00.844 +0100 /// # Command: fclones group . /// # Base dir: /home/pkolaczk/Projekty/fclones /// # Total: 13589 B (13.6 KB) in 31 files in 14 groups /// # Redundant: 6819 B (6.8 KB) in 17 files /// # Missing: 0 B (0 B) in 0 files /// 49165422e775f631cca3b09124f8ee89, 6274 B (6.3 KB) * 2: /// /home/pkolaczk/Projekty/fclones/src/semaphore.rs /// /home/pkolaczk/Projekty/fclones/.git/rr-cache/d5cde6e71942982e722d6dfe41936c9036ba9f4b/postimage /// dcf2e11190ccc260f2388d9a5a2ed20e, 41 B (41 B) * 2: /// /home/pkolaczk/Projekty/fclones/.git/refs/heads/diff_roots /// /home/pkolaczk/Projekty/fclones/.git/refs/remotes/origin/diff_roots /// d0521f268e17c28b10c48e5f5de48f21, 41 B (41 B) * 2: /// /home/pkolaczk/Projekty/fclones/.git/refs/heads/fix_flock_freebsd /// /home/pkolaczk/Projekty/fclones/.git/refs/remotes/origin/fix_flock_freebsd /// ... 
/// ``` pub fn write_as_text<I, G, P>(&mut self, header: &ReportHeader, groups: I) -> io::Result<()> where I: IntoIterator<Item = G>, G: AsRef<FileGroup<P>>, P: AsRef<Path>, { let command = arg::join(&header.command); self.write_header_line(&format!("Report by fclones {}", header.version))?; self.write_header_line(&format!( "Timestamp: {}", header.timestamp.format(TIMESTAMP_FMT) ))?; self.write_header_line(&format!("Command: {command}"))?; self.write_header_line(&format!( "Base dir: {}", header.base_dir.to_escaped_string() ))?; if let Some(stats) = &header.stats { self.write_header_line(&format!( "Total: {} B ({}) in {} files in {} groups", stats.total_file_size.0, stats.total_file_size, stats.total_file_count, stats.group_count ))?; self.write_header_line(&format!( "Redundant: {} B ({}) in {} files", stats.redundant_file_size.0, stats.redundant_file_size, stats.redundant_file_count ))?; self.write_header_line(&format!( "Missing: {} B ({}) in {} files", stats.missing_file_size.0, stats.missing_file_size, stats.missing_file_count ))?; } for g in groups { let g = g.as_ref(); let group_header = format!( "{}, {} B ({}) * {}:", g.file_hash, g.file_len.0, g.file_len, g.files.len() ); let group_header = style(group_header).yellow(); writeln!(self.out, "{}", group_header.force_styling(self.color),)?; for f in g.files.iter() { writeln!(self.out, " {}", f.as_ref().to_escaped_string())?; } } Ok(()) } /// Writes the report in `fdupes` compatible format. /// This is very similar to the TEXT format, but there are no headers /// for each group, and groups are separated with empty lines. pub fn write_as_fdupes<I, G, P>(&mut self, _header: &ReportHeader, groups: I) -> io::Result<()> where I: IntoIterator<Item = G>, G: AsRef<FileGroup<P>>, P: AsRef<Path>, { for g in groups { let g = g.as_ref(); for f in g.files.iter() { writeln!(self.out, "{}", f.as_ref().to_escaped_string())?; } writeln!(self.out)?; } Ok(()) } /// Writes results in CSV format. /// /// Each file group is written as one line. /// The number of columns is dynamic. /// Columns: /// - file size in bytes /// - file hash (may be empty) /// - number of files in the group /// - file paths - each file in a separate column pub fn write_as_csv<I, G, P>(&mut self, _header: &ReportHeader, groups: I) -> io::Result<()> where I: IntoIterator<Item = G>, G: AsRef<FileGroup<P>>, P: AsRef<Path>, { let mut wtr = csv::WriterBuilder::new() .delimiter(b',') .quote_style(csv::QuoteStyle::Necessary) .flexible(true) .from_writer(&mut self.out); wtr.write_record(["size", "hash", "count", "files"])?; for g in groups { let g = g.as_ref(); let mut record = csv::StringRecord::new(); record.push_field(g.file_len.0.to_string().as_str()); record.push_field(g.file_hash.to_string().as_str()); record.push_field(g.files.len().to_string().as_str()); for f in g.files.iter() { record.push_field(f.as_ref().to_escaped_string().as_ref()); } wtr.write_record(&record)?; } wtr.flush() } /// Writes results as JSON. 
/// # Example output /// ```json /// { /// "header": { /// "version": "0.18.0", /// "timestamp": "2022-03-18T08:24:28.793228077+01:00", /// "command": [ /// "fclones", /// "group", /// ".", /// "-f", /// "json" /// ], /// "base_dir": "/home/pkolaczk/Projekty/fclones", /// "stats": { /// "group_count": 14, /// "total_file_count": 31, /// "total_file_size": 13589, /// "redundant_file_count": 17, /// "redundant_file_size": 6819, /// "missing_file_count": 0, /// "missing_file_size": 0 /// } /// }, /// "groups": [ /// { /// "file_len": 6274, /// "file_hash": "49165422e775f631cca3b09124f8ee89", /// "files": [ /// "/home/pkolaczk/Projekty/fclones/src/semaphore.rs", /// "/home/pkolaczk/Projekty/fclones/.git/rr-cache/d5cde6e71942982e722d6dfe41936c9036ba9f4b/postimage" /// ] /// }, /// { /// "file_len": 41, /// "file_hash": "dcf2e11190ccc260f2388d9a5a2ed20e", /// "files": [ /// "/home/pkolaczk/Projekty/fclones/.git/refs/heads/diff_roots", /// "/home/pkolaczk/Projekty/fclones/.git/refs/remotes/origin/diff_roots" /// ] /// }, /// ``` pub fn write_as_json<I, G, P>(&mut self, header: &ReportHeader, groups: I) -> io::Result<()> where I: IntoIterator<Item = G>, G: AsRef<FileGroup<P>>, P: AsRef<Path>, { let groups = groups.into_iter().map(|g| FileGroup { file_len: g.as_ref().file_len, file_hash: g.as_ref().file_hash.clone(), files: g .as_ref() .files .iter() .map(|f| f.as_ref().clone()) .collect(), }); let report = SerializableReport { header, groups: IteratorWrapper(Cell::new(Some(groups))), }; serde_json::to_writer_pretty(&mut self.out, &report)?; Ok(()) } /// Writes the report in the format given by `format` parameter. pub fn write<I, G, P>( &mut self, format: OutputFormat, header: &ReportHeader, groups: I, ) -> io::Result<()> where I: IntoIterator<Item = G>, G: AsRef<FileGroup<P>>, P: AsRef<Path>, { match format { OutputFormat::Default => self.write_as_text(header, groups), OutputFormat::Fdupes => self.write_as_fdupes(header, groups), OutputFormat::Csv => self.write_as_csv(header, groups), OutputFormat::Json => self.write_as_json(header, groups), } } } /// Iterator over groups of files, read form the report pub type GroupIterator = dyn FallibleIterator<Item = FileGroup<Path>, Error = io::Error> + Send; /// Reads a report from a stream. pub trait ReportReader { /// Reads the header. Must be called exactly once before reading the groups. /// Reports an io::Error with ErrorKind::InvalidData /// if the report header is malformed. fn read_header(&mut self) -> io::Result<ReportHeader>; /// Opens an iterator over groups. fn read_groups(self: Box<Self>) -> io::Result<Box<GroupIterator>>; } /// Iterates the contents of the report. /// Each emitted item is a group of duplicate files. 
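///
/// Reading sketch (illustrative; the report file name is hypothetical and the example
/// is marked `ignore` since it uses crate-internal types):
///
/// ```ignore
/// use fallible_iterator::FallibleIterator;
/// use std::fs::File;
/// use std::io::BufReader;
///
/// let mut reader = Box::new(TextReportReader::new(BufReader::new(File::open("report.txt")?)));
/// let header = reader.read_header()?;
/// let mut groups = reader.read_groups()?;
/// while let Some(group) = groups.next()? {
///     println!("{} files of {} each", group.files.len(), group.file_len);
/// }
/// ```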
pub struct TextReportIterator<R: BufRead> { stream: R, line_buf: String, stopped_on_error: bool, } /// Helper struct to encapsulate the data in the header before each group of identical files #[derive(Debug, Eq, PartialEq, Serialize)] struct GroupHeader { count: usize, file_len: FileLen, file_hash: FileHash, } impl<R> TextReportIterator<R> where R: BufRead, { fn new(input: R) -> TextReportIterator<R> { TextReportIterator { stream: input, line_buf: String::new(), stopped_on_error: false, } } fn read_first_non_comment_line(&mut self) -> io::Result<Option<&str>> { loop { self.line_buf.clear(); self.stream.read_line(&mut self.line_buf)?; let line = self.line_buf.trim(); if line.is_empty() { return Ok(None); } if !line.starts_with('#') { break; } } Ok(Some(self.line_buf.trim())) } fn read_group_header(&mut self) -> io::Result<Option<GroupHeader>> { let header_str = match self.read_first_non_comment_line()? { None => return Ok(None), Some(s) => s, }; lazy_static! { static ref GROUP_HEADER_RE: Regex = Regex::new(r"^([a-f0-9]+), ([0-9]+) B [^*]* \* ([0-9]+):").unwrap(); } let captures = GROUP_HEADER_RE.captures(header_str).ok_or_else(|| { Error::new( ErrorKind::InvalidData, format!("Malformed group header: {header_str}"), ) })?; Ok(Some(GroupHeader { file_hash: FileHash::from_str(captures.get(1).unwrap().as_str()).unwrap(), file_len: FileLen(captures.get(2).unwrap().as_str().parse::<u64>().unwrap()), count: captures.get(3).unwrap().as_str().parse::<usize>().unwrap(), })) } fn read_paths(&mut self, count: usize) -> io::Result<Vec<Path>> { let mut paths = Vec::with_capacity(min(count, 1024)); for _ in 0..count { self.line_buf.clear(); let n = self.stream.read_line(&mut self.line_buf)?; let path_str = &self.line_buf; if n == 0 { return Err(Error::new( ErrorKind::UnexpectedEof, "Unexpected end of file.", )); } if !path_str.starts_with(" ") || path_str.trim().is_empty() { return Err(Error::new( ErrorKind::InvalidData, format!("Path expected: {path_str}"), )); } let path = Path::from_escaped_string(path_str.trim()).map_err(|e| { Error::new( ErrorKind::InvalidData, format!("Invalid path {path_str}: {e}"), ) })?; paths.push(path); } Ok(paths) } } impl<R: BufRead + 'static> FallibleIterator for TextReportIterator<R> { type Item = FileGroup<Path>; type Error = std::io::Error; fn next(&mut self) -> Result<Option<Self::Item>, Self::Error> { if self.stopped_on_error { return Ok(None); } match self.read_group_header() { Ok(Some(header)) => { let paths = self.read_paths(header.count)?; Ok(Some(FileGroup { file_len: header.file_len, file_hash: header.file_hash, files: paths, })) } Ok(None) => Ok(None), Err(e) => { self.stopped_on_error = true; Err(e) } } } } /// Reads a text report from a stream. /// /// Currently supports only the default text report format. /// Does not load the whole report into memory. /// Allows iterating over groups of files. pub struct TextReportReader<R: BufRead> { pub stream: R, } impl<R: BufRead> TextReportReader<R> { /// Creates a new reader for reading from the given stream pub fn new(stream: R) -> TextReportReader<R> { TextReportReader { stream } } fn read_line(&mut self) -> io::Result<String> { let mut line_buf = String::new(); self.stream.read_line(&mut line_buf)?; Ok(line_buf) } fn read_extract(&mut self, regex: &Regex, name: &str) -> io::Result<Vec<String>> { let line = self.read_line()?; Ok(regex .captures(line.trim()) .ok_or_else(|| { Error::new( ErrorKind::InvalidData, format!("Malformed header: Missing {name}"), ) })? 
.iter() .skip(1) .map(|c| c.unwrap().as_str().to_owned()) .collect()) } fn parse_timestamp(value: &str, name: &str) -> io::Result<DateTime<FixedOffset>> { DateTime::parse_from_str(value, TIMESTAMP_FMT).map_err(|e| { Error::new( ErrorKind::InvalidData, format!( "Malformed header: Failed to parse {name}: {e}. Expected timestamp format: {TIMESTAMP_FMT}" ), ) }) } fn parse_u64(value: Option<&String>, name: &str) -> io::Result<u64> { match value { Some(value) => value.parse().map_err(|e| { Error::new( ErrorKind::InvalidData, format!( "Malformed header: Failed to parse {name}: {e}. Expected integer value." ), ) }), None => Err(Error::new( ErrorKind::InvalidData, format!("Malformed header: Missing {name}"), )), } } fn parse_usize(value: Option<&String>, name: &str) -> io::Result<usize> { Ok(Self::parse_u64(value, name)? as usize) } fn parse_file_len(value: Option<&String>, name: &str) -> io::Result<FileLen> { let value = Self::parse_u64(value, name)?; Ok(FileLen(value)) } } impl<R: BufRead + Send + 'static> ReportReader for TextReportReader<R> { fn read_header(&mut self) -> io::Result<ReportHeader> { lazy_static! { static ref VERSION_RE: Regex = Regex::new(r"^# Report by fclones ([0-9]+\.[0-9]+\.[0-9]+)").unwrap(); static ref TIMESTAMP_RE: Regex = Regex::new(r"^# Timestamp: (.*)").unwrap(); static ref COMMAND_RE: Regex = Regex::new(r"^# Command: (.*)").unwrap(); static ref BASE_DIR_RE: Regex = Regex::new(r"^# Base dir: (.*)").unwrap(); static ref TOTAL_RE: Regex = Regex::new(r"^# Total: ([0-9]+) B \([^)]+\) in ([0-9]+) files in ([0-9]+) groups") .unwrap(); static ref REDUNDANT_RE: Regex = Regex::new(r"^# Redundant: ([0-9]+) B \([^)]+\) in ([0-9]+) files").unwrap(); static ref MISSING_RE: Regex = Regex::new(r"^# Missing: ([0-9]+) B \([^)]+\) in ([0-9]+) files").unwrap(); } let version = self .read_extract(&VERSION_RE, "fclones version")? .swap_remove(0); let timestamp = self .read_extract(&TIMESTAMP_RE, "timestamp")? 
.swap_remove(0); let timestamp = Self::parse_timestamp(&timestamp, "timestamp")?; let command = self.read_extract(&COMMAND_RE, "command")?.swap_remove(0); let command = arg::split(&command).map_err(|e| { Error::new( ErrorKind::InvalidData, format!("Malformed header: Failed to parse command arguments: {e}"), ) })?; let base_dir = self.read_extract(&BASE_DIR_RE, "base dir")?.swap_remove(0); let base_dir = Path::from(base_dir); let stats = self.read_extract(&TOTAL_RE, "total file statistics")?; let total_file_size = Self::parse_file_len(stats.get(0), "total file size")?; let total_file_count = Self::parse_usize(stats.get(1), "total file count")?; let group_count = Self::parse_usize(stats.get(2), "group count")?; let stats = self.read_extract(&REDUNDANT_RE, "redundant file statistics")?; let redundant_file_size = Self::parse_file_len(stats.get(0), "redundant file size")?; let redundant_file_count = Self::parse_usize(stats.get(1), "redundant file count")?; let stats = self.read_extract(&MISSING_RE, "missing file statistics")?; let missing_file_size = Self::parse_file_len(stats.get(0), "missing file size")?; let missing_file_count = Self::parse_usize(stats.get(1), "missing file count")?; Ok(ReportHeader { version, timestamp, command, base_dir, stats: Some(FileStats { group_count, total_file_count, total_file_size, redundant_file_count, redundant_file_size, missing_file_count, missing_file_size, }), }) } fn read_groups( self: Box<Self>, ) -> io::Result<Box<dyn FallibleIterator<Item = FileGroup<Path>, Error = Error> + Send>> { Ok(Box::new(TextReportIterator::new(self.stream))) } } /// Reads a report from a JSON file. /// Currently it is not very memory efficient, because it is limited to reading the whole file and /// deserializing all data into memory. pub struct JsonReportReader { report: DeserializedReport, } impl JsonReportReader { pub fn new<R: Read>(stream: R) -> io::Result<JsonReportReader> { let report: DeserializedReport = serde_json::from_reader(stream).map_err(|e| { Error::new( ErrorKind::InvalidData, format!("Failed to deserialize JSON report: {e}"), ) })?; Ok(JsonReportReader { report }) } } impl ReportReader for JsonReportReader { fn read_header(&mut self) -> io::Result<ReportHeader> { Ok(self.report.header.clone()) } fn read_groups(self: Box<Self>) -> io::Result<Box<GroupIterator>> { let iter = self.report.groups.into_iter().map(|g| { Ok(FileGroup { file_len: g.file_len, file_hash: g.file_hash, files: g .files .iter() .map(|s| { Path::from_escaped_string(s.as_str()).map_err(|e| { io::Error::new( io::ErrorKind::InvalidData, format!("Invalid path {s}: {e}"), ) }) }) .try_collect()?, }) }); let iter = fallible_iterator::convert(iter); Ok(Box::new(iter)) } } /// Returns a `ReportReader` that can read and decode the report from the given stream. /// Automatically detects the type of the report. pub fn open_report(r: impl Read + Send + 'static) -> io::Result<Box<dyn ReportReader>> { let mut buf_reader = BufReader::with_capacity(16 * 1024, r); let preview = buf_reader.fill_buf()?; let preview = String::from_utf8_lossy(preview); if preview.starts_with('{') { Ok(Box::new(JsonReportReader::new(buf_reader)?)) } else if preview.starts_with('#') { Ok(Box::new(TextReportReader::new(buf_reader))) } else { Err(io::Error::new( ErrorKind::InvalidData, format!( "Unknown report format.
Supported formats are: {}, {}", OutputFormat::Default, OutputFormat::Json ), )) } } #[cfg(test)] mod test { use std::env::current_dir; use std::ffi::OsString; use tempfile::NamedTempFile; use crate::file::{FileHash, FileLen}; use crate::path::Path; use super::*; fn dummy_report_header() -> ReportHeader { ReportHeader { command: vec![Arg::from("fclones"), Arg::from("find"), Arg::from(".")], base_dir: Path::from(current_dir().unwrap()), version: env!("CARGO_PKG_VERSION").to_owned(), timestamp: DateTime::parse_from_str("2021-08-27 12:11:23.456 +0000", TIMESTAMP_FMT) .unwrap(), stats: Some(FileStats { group_count: 4, total_file_count: 1000, total_file_size: FileLen(2500), redundant_file_count: 234, redundant_file_size: FileLen(1000), missing_file_count: 93, missing_file_size: FileLen(300), }), } } #[test] fn test_text_report_reader_reads_header() { let header1 = dummy_report_header(); let groups: Vec<FileGroup<Path>> = vec![]; let output = NamedTempFile::new().unwrap(); let input = output.reopen().unwrap(); let mut writer = ReportWriter::new(output, false); writer.write_as_text(&header1, groups.into_iter()).unwrap(); let mut reader = TextReportReader::new(BufReader::new(input)); let header2 = reader.read_header().unwrap(); assert_eq!(header2.version, header1.version); assert_eq!(header2.command, header1.command); assert_eq!(header2.timestamp.timestamp(), header1.timestamp.timestamp()); assert_eq!(header2.stats, header1.stats); } fn roundtrip_groups_text(header: &ReportHeader, groups: Vec<FileGroup<Path>>) { let output = NamedTempFile::new().unwrap(); let input = output.reopen().unwrap(); let mut writer = ReportWriter::new(output, false); writer.write_as_text(header, groups.iter()).unwrap(); let mut reader = Box::new(TextReportReader::new(BufReader::new(input))); reader.read_header().unwrap(); let groups2: Vec<_> = reader.read_groups().unwrap().collect().unwrap(); assert_eq!(groups, groups2); } #[test] fn test_text_report_reader_reads_files() { let header = dummy_report_header(); let groups = vec![ FileGroup { file_len: FileLen(100), file_hash: FileHash::from(0x00112233445566778899aabbccddeeff), files: vec![Path::from("a"), Path::from("b")], }, FileGroup { file_len: FileLen(40), file_hash: FileHash::from(0x0000000000000555555555ffffffffff), files: vec![Path::from("c"), Path::from("d")], }, ]; roundtrip_groups_text(&header, groups); } #[test] fn test_text_report_reader_reads_files_with_control_chars_in_names() { let header = dummy_report_header(); let groups = vec![ FileGroup { file_len: FileLen(100), file_hash: FileHash::from(0x00112233445566778899aabbccddeeff), files: vec![Path::from("\t\r\n/foo"), Path::from("Ä…Ä™/ść/żź/óń/")], }, FileGroup { file_len: FileLen(40), file_hash: FileHash::from(0x0000000000000555555555ffffffffff), files: vec![Path::from("c\u{7f}"), Path::from("😀/😋")], }, ]; roundtrip_groups_text(&header, groups); } #[cfg(unix)] #[test] fn test_text_report_reader_reads_files_with_non_utf8_chars_in_names() { use std::os::unix::ffi::OsStringExt; let header = dummy_report_header(); let groups = vec![FileGroup { file_len: FileLen(100), file_hash: FileHash::from(0x00112233445566778899aabbccddeeff), files: vec![Path::from(OsString::from_vec(vec![ 0xED, 0xA0, 0xBD, 0xED, 0xB8, 0x8D, ]))], }]; roundtrip_groups_text(&header, groups); } #[test] fn test_text_report_iterator_stops_on_error() { let mut output = NamedTempFile::new().unwrap(); let input = output.reopen().unwrap(); writeln!(output, "7d6ebf613bf94dfd976d169ff6ae02c3, 4 B (4 B) * 2:").unwrap(); writeln!(output, " /file1").unwrap(); 
writeln!(output, " /file2").unwrap(); writeln!(output, "malformed group header:").unwrap(); writeln!(output, " /file1").unwrap(); writeln!(output, " /file2").unwrap(); drop(output); let mut group_iterator = TextReportIterator::new(BufReader::new(input)); assert!(group_iterator.next().is_ok()); assert!(group_iterator.next().is_err()); assert!(group_iterator.next().unwrap().is_none()); } #[test] fn test_text_report_iterator_handles_windows_endlines() { let mut output = NamedTempFile::new().unwrap(); let input = output.reopen().unwrap(); write!( output, "7d6ebf613bf94dfd976d169ff6ae02c3, 4 B (4 B) * 2:\r\n" ) .unwrap(); write!(output, " /file1\r\n").unwrap(); write!(output, " /file2\r\n").unwrap(); write!( output, "7d6edf123096e5f4b7fcd002351faccc, 4 B (4 B) * 2:\r\n" ) .unwrap(); write!(output, " /file3\r\n").unwrap(); write!(output, " /file4\r\n").unwrap(); drop(output); let mut group_iterator = TextReportIterator::new(BufReader::new(input)); let g = group_iterator.next().unwrap().unwrap(); assert!(g.files.contains(&Path::from("/file1"))); assert!(g.files.contains(&Path::from("/file2"))); let g = group_iterator.next().unwrap().unwrap(); assert!(g.files.contains(&Path::from("/file3"))); assert!(g.files.contains(&Path::from("/file4"))); } #[test] fn test_json_report_header() { let header1 = dummy_report_header(); let groups: Vec<FileGroup<Path>> = vec![]; let output = NamedTempFile::new().unwrap(); let input = output.reopen().unwrap(); let mut writer = ReportWriter::new(output, false); writer.write_as_json(&header1, groups.into_iter()).unwrap(); let mut reader = JsonReportReader::new(input).unwrap(); let header2 = reader.read_header().unwrap(); assert_eq!(header2.version, header1.version); assert_eq!(header2.command, header1.command); assert_eq!(header2.timestamp.timestamp(), header1.timestamp.timestamp()); assert_eq!(header2.stats, header1.stats); } fn roundtrip_groups_json(header: &ReportHeader, groups: Vec<FileGroup<Path>>) { let output = NamedTempFile::new().unwrap(); let input = output.reopen().unwrap(); let mut writer = ReportWriter::new(output, false); writer.write_as_json(header, groups.iter()).unwrap(); let mut reader = Box::new(JsonReportReader::new(input).unwrap()); reader.read_header().unwrap(); let groups2: Vec<_> = reader.read_groups().unwrap().collect().unwrap(); assert_eq!(groups, groups2); } #[test] fn test_json_report_reader_reads_files() { let header = dummy_report_header(); let groups = vec![ FileGroup { file_len: FileLen(100), file_hash: FileHash::from(0x00112233445566778899aabbccddeeff), files: vec![Path::from("a"), Path::from("b")], }, FileGroup { file_len: FileLen(40), file_hash: FileHash::from(0x0000000000000555555555ffffffffff), files: vec![Path::from("c"), Path::from("d")], }, ]; roundtrip_groups_json(&header, groups); } #[test] fn test_json_report_reader_reads_files_with_control_chars_in_names() { let header = dummy_report_header(); let groups = vec![ FileGroup { file_len: FileLen(100), file_hash: FileHash::from(0x00112233445566778899aabbccddeeff), files: vec![Path::from("\t\r\n/foo"), Path::from("Ä…Ä™/ść/żź/óń/")], }, FileGroup { file_len: FileLen(40), file_hash: FileHash::from(0x0000000000000555555555ffffffffff), files: vec![Path::from("c\u{7f}"), Path::from("😀/😋")], }, ]; roundtrip_groups_json(&header, groups); } #[cfg(unix)] #[test] fn test_json_report_reader_reads_files_with_non_utf8_chars_in_names() { use std::os::unix::ffi::OsStringExt; let header = dummy_report_header(); let groups = vec![FileGroup { file_len: FileLen(100), file_hash: 
FileHash::from(0x00112233445566778899aabbccddeeff), files: vec![Path::from(OsString::from_vec(vec![ 0xED, 0xA0, 0xBD, 0xED, 0xB8, 0x8D, ]))], }]; roundtrip_groups_json(&header, groups); } fn roundtrip_header(header: &ReportHeader, format: OutputFormat) -> ReportHeader { let groups: Vec<FileGroup<Path>> = vec![]; let output = NamedTempFile::new().unwrap(); let input = output.reopen().unwrap(); let mut writer = ReportWriter::new(output, false); writer.write(format, header, groups.iter()).unwrap(); let mut reader = open_report(input).unwrap(); reader.read_header().unwrap() } #[test] fn test_format_autodetection() { let header = dummy_report_header(); let reread_header_1 = roundtrip_header(&header, OutputFormat::Default); let reread_header_2 = roundtrip_header(&header, OutputFormat::Json); assert_eq!(header, reread_header_1); assert_eq!(header, reread_header_2); } } 0707010000001F000081A4000000000000000000000001653E86C200000705000000000000000000000000000000000000002500000000fclones-0.34.0/fclones/src/rlimit.rs// The non-unix cfg does not use all imports #![allow(unused_imports)] use crate::semaphore::Semaphore; use lazy_static::lazy_static; use std::sync::Arc; #[cfg(unix)] // Get the maximum number of open file descriptors for this process, and if // the hard limit is larger than the soft limit increase it. fn rlimit_nofile() -> libc::rlim_t { let mut file_limit = libc::rlimit { rlim_cur: 0, rlim_max: 0, }; unsafe { if libc::getrlimit(libc::RLIMIT_NOFILE, &mut file_limit) != 0 { return 200; } } if file_limit.rlim_max > file_limit.rlim_cur { let prev = file_limit.rlim_cur; file_limit.rlim_cur = file_limit.rlim_max; unsafe { if libc::setrlimit(libc::RLIMIT_NOFILE, &file_limit) == 0 { file_limit.rlim_max } else { prev } } } else { file_limit.rlim_cur } } #[cfg(unix)] // stdin, stdout, stderr, plus two as a buffer const OTHER_OPEN_FILES: isize = 3 + 2; #[cfg(unix)] lazy_static! { // Globally track the number of opened files so many parallel operations do not raise // "Too many open files (os error 24)". pub static ref RLIMIT_OPEN_FILES: Arc<Semaphore> = Arc::new(Semaphore::new(std::cmp::max( rlimit_nofile() as isize - OTHER_OPEN_FILES, 64 // fallback value ))); } #[cfg(not(unix))] pub mod not_unix { #[derive(Clone, Copy)] pub struct NoRlimit; impl NoRlimit { pub fn new() -> Self { Self {} } pub fn clone(self) -> Self { self } pub fn access_owned(self) -> () {} } } #[cfg(not(unix))] lazy_static! { pub static ref RLIMIT_OPEN_FILES: not_unix::NoRlimit = not_unix::NoRlimit::new(); } 07070100000020000081A4000000000000000000000001653E86C200002526000000000000000000000000000000000000002700000000fclones-0.34.0/fclones/src/selector.rsuse std::path::MAIN_SEPARATOR; use std::sync::Arc; use crate::path::Path; use crate::pattern::Pattern; /// Stores glob patterns working together as a path selector. /// /// A path is selected only if it matches at least one include pattern /// and doesn't match any exclude patterns. /// An empty include pattern vector matches all paths. #[derive(Debug, Clone)] pub struct PathSelector { base_dir: Arc<Path>, included_names: Vec<Pattern>, included_paths: Vec<Pattern>, excluded_paths: Vec<Pattern>, } impl PathSelector { /// Creates a new selector that matches all paths. 
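/// # Example
/// A minimal sketch (illustrative): start from a selector that matches everything,
/// then narrow it down with the builder methods below (paths and globs are assumptions):
/// ```ignore
/// let selector = PathSelector::new(Path::from("/data"))
///     .include_paths(vec![Pattern::glob("**/*.jpg").unwrap()])
///     .exclude_paths(vec![Pattern::glob("**/cache/**").unwrap()]);
/// assert!(selector.matches_full_path(&Path::from("/data/photos/a.jpg")));
/// assert!(!selector.matches_full_path(&Path::from("/data/cache/old.jpg")));
/// ```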
pub fn new(base_dir: Path) -> PathSelector { PathSelector { base_dir: Arc::new(base_dir), included_names: vec![], included_paths: vec![], excluded_paths: vec![], } } pub fn include_names(mut self, pat: Vec<Pattern>) -> PathSelector { self.included_names = pat; self } pub fn include_paths(mut self, pat: Vec<Pattern>) -> PathSelector { self.included_paths = pat .into_iter() .map(|p| Self::abs_pattern(&self.base_dir, p)) .collect(); self } pub fn exclude_paths(mut self, pat: Vec<Pattern>) -> PathSelector { self.excluded_paths = pat .into_iter() .map(|p| Self::abs_pattern(&self.base_dir, p)) .collect(); self } /// Returns true if the given path fully matches this selector. pub fn matches_full_path(&self, path: &Path) -> bool { self.with_absolute_path(path, |path| { let name = path .file_name_cstr() .map(|s| s.to_string_lossy().to_string()) .unwrap_or_default(); let name = name.as_ref(); let path = path.to_string_lossy(); (self.included_names.is_empty() || self.included_names.iter().any(|p| p.matches(name))) && (self.included_paths.is_empty() || self.included_paths.iter().any(|p| p.matches(&path))) && self.excluded_paths.iter().all(|p| !p.matches(&path)) }) } /// Returns true if the given directory may contain matching paths. /// Used to decide whether the directory walk should descend to that directory. /// The directory should be allowed only if: /// 1. all its components match a prefix of at least one include filter, /// 2. it doesn't match any of the exclude filters ending with `**` pattern. pub fn matches_dir(&self, path: &Path) -> bool { self.with_absolute_path(path, |path| { let mut path = path.to_string_lossy(); if !path.ends_with(MAIN_SEPARATOR) { path.push(MAIN_SEPARATOR); } (self.included_paths.is_empty() || self .included_paths .iter() .any(|p| p.matches_partially(&path))) && self.excluded_paths.iter().all(|p| !p.matches_prefix(&path)) }) } /// Executes given code with a reference to an absolute path. /// If `path` is already absolute, a direct reference is provided and no allocations happen. /// If `path` is relative, it would be appended to the `self.base_path` first and a reference /// to the temporary result will be provided. fn with_absolute_path<F, R>(&self, path: &Path, f: F) -> R where F: Fn(&Path) -> R, { if path.is_absolute() { (f)(path) } else { (f)(&self.base_dir.join(path)) } } /// Returns an absolute pattern. /// If pattern is relative (i.e. does not start with fs root), then the base_dir is appended. 
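/// For example (illustrative), with base dir `/base` the relative pattern `foo/**`
/// becomes `/base/foo/**`, while an already absolute pattern is returned unchanged.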
fn abs_pattern(base_dir: &Path, pattern: Pattern) -> Pattern { if Self::is_absolute(&pattern) { pattern } else { let base_dir_pat = base_dir.to_string_lossy(); let base_dir_pat = base_dir_pat.replace('\u{FFFD}', "?"); let base_dir_pat = Pattern::literal(Self::append_sep(base_dir_pat).as_str()); base_dir_pat + pattern } } /// Appends path separator if the string doesn't end with one already fn append_sep(s: String) -> String { if s.ends_with(MAIN_SEPARATOR) { s } else { s + MAIN_SEPARATOR.to_string().as_str() } } /// Returns true if pattern can match absolute paths fn is_absolute(pattern: &Pattern) -> bool { let s = pattern.to_string(); s.starts_with(".*") || Path::from(s).is_absolute() } } #[cfg(test)] mod test { use super::*; #[test] fn match_all() { let selector = PathSelector::new(Path::from("/test")); assert!(selector.matches_full_path(&Path::from("/test/foo/bar"))); assert!(selector.matches_full_path(&Path::from("/test/foo/bar/baz"))); assert!(selector.matches_full_path(&Path::from("/test/anything123"))); } #[test] fn include_absolute() { let selector = PathSelector::new(Path::from("/test")) .include_paths(vec![Pattern::glob("/test/foo/**").unwrap()]); assert!(selector.matches_full_path(&Path::from("/test/foo/bar"))); assert!(selector.matches_full_path(&Path::from("/test/foo/bar/baz"))); assert!(!selector.matches_full_path(&Path::from("/test/bar"))); } #[test] fn include_relative() { let selector = PathSelector::new(Path::from("/test")) .include_paths(vec![Pattern::glob("foo/**").unwrap()]); // matching: assert!(selector.matches_full_path(&Path::from("/test/foo/bar"))); assert!(selector.matches_full_path(&Path::from("/test/foo/bar/baz"))); assert!(selector.matches_full_path(&Path::from("foo/bar"))); assert!(selector.matches_full_path(&Path::from("foo/bar/baz"))); // not matching: assert!(!selector.matches_full_path(&Path::from("bar"))); assert!(!selector.matches_full_path(&Path::from("/bar"))); assert!(!selector.matches_full_path(&Path::from("/test/bar"))); } #[test] fn include_relative_root_base() { let selector = PathSelector::new(Path::from("/")) .include_paths(vec![Pattern::glob("foo/**").unwrap()]); // matching: assert!(selector.matches_full_path(&Path::from("/foo/bar"))); assert!(selector.matches_full_path(&Path::from("/foo/bar/baz"))); assert!(selector.matches_full_path(&Path::from("foo/bar"))); assert!(selector.matches_full_path(&Path::from("foo/bar/baz"))); // not matching: assert!(!selector.matches_full_path(&Path::from("bar"))); assert!(!selector.matches_full_path(&Path::from("/bar"))); assert!(!selector.matches_full_path(&Path::from("/test/bar"))); } #[test] fn include_exclude() { let selector = PathSelector::new(Path::from("/")) .include_paths(vec![Pattern::glob("/foo/**").unwrap()]) .exclude_paths(vec![Pattern::glob("/foo/b*/**").unwrap()]); // matching: assert!(selector.matches_full_path(&Path::from("/foo/foo"))); assert!(selector.matches_full_path(&Path::from("/foo/foo/foo"))); // not matching: assert!(!selector.matches_full_path(&Path::from("/foo/bar/baz"))); assert!(!selector.matches_full_path(&Path::from("/test/bar"))); } #[test] fn prefix_wildcard() { let selector = PathSelector::new(Path::from("/")) .include_paths(vec![Pattern::glob("**/public-?.jpg").unwrap()]) .exclude_paths(vec![Pattern::glob("**/private-?.jpg").unwrap()]); println!("{selector:?}"); // matching absolute: assert!(selector.matches_full_path(&Path::from("/public-1.jpg"))); assert!(selector.matches_full_path(&Path::from("/foo/public-2.jpg"))); 
assert!(selector.matches_full_path(&Path::from("/foo/foo/public-3.jpg"))); // matching relative: assert!(selector.matches_full_path(&Path::from("foo/public-2.jpg"))); assert!(selector.matches_full_path(&Path::from("foo/foo/public-3.jpg"))); // not matching absolute: assert!(!selector.matches_full_path(&Path::from("/something-else.jpg"))); assert!(!selector.matches_full_path(&Path::from("/private-1.jpg"))); assert!(!selector.matches_full_path(&Path::from("/foo/private-2.jpg"))); assert!(!selector.matches_full_path(&Path::from("/foo/foo/private-3.jpg"))); // not matching relative: assert!(!selector.matches_full_path(&Path::from("something-else.jpg"))); assert!(!selector.matches_full_path(&Path::from("private-1.jpg"))); assert!(!selector.matches_full_path(&Path::from("foo/private-2.jpg"))); assert!(!selector.matches_full_path(&Path::from("foo/foo/private-3.jpg"))); } #[test] fn matches_dir() { let selector = PathSelector::new(Path::from("/")) .include_paths(vec![Pattern::glob("/test[1-9]/**").unwrap()]) .exclude_paths(vec![Pattern::glob("/test[1-9]/foo/**").unwrap()]); assert!(selector.matches_dir(&Path::from("/"))); assert!(selector.matches_dir(&Path::from("/test1"))); assert!(selector.matches_dir(&Path::from("/test2/bar"))); assert!(!selector.matches_dir(&Path::from("/test3/foo"))); assert!(!selector.matches_dir(&Path::from("/test3/foo/bar/baz"))); } } 07070100000021000081A4000000000000000000000001653E86C200001882000000000000000000000000000000000000002800000000fclones-0.34.0/fclones/src/semaphore.rs// Copyright 2014 The Rust Project Developers. See the COPYRIGHT // file at the top-level directory of this distribution and at // http://rust-lang.org/COPYRIGHT. // // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your // option. This file may not be copied, modified, or distributed // except according to those terms. // // Owned guards added by Piotr KoÅ‚aczkowski use std::ops::Drop; use std::sync::{Arc, Condvar, Mutex}; /// A counting, blocking, semaphore. /// /// Semaphores are a form of atomic counter where access is only granted if the /// counter is a positive value. Each acquisition will block the calling thread /// until the counter is positive, and each release will increment the counter /// and unblock any threads if necessary. /// pub struct Semaphore { lock: Mutex<isize>, cvar: Condvar, } /// An RAII guard which will release a resource acquired from a semaphore when /// dropped. pub struct SemaphoreGuard<'a> { sem: &'a Semaphore, } /// An RAII guard which will release a resource acquired from a semaphore when /// dropped. This guard allows the semaphore to be acquired on a different thread than /// released. pub struct OwnedSemaphoreGuard { sem: Arc<Semaphore>, } #[allow(clippy::mutex_atomic, unused)] impl Semaphore { /// Creates a new semaphore with the initial count specified. /// /// The count specified can be thought of as a number of resources, and a /// call to `acquire` or `access` will block until at least one resource is /// available. It is valid to initialize a semaphore with a negative count. pub fn new(count: isize) -> Semaphore { Semaphore { lock: Mutex::new(count), cvar: Condvar::new(), } } /// Acquires a resource of this semaphore, blocking the current thread until /// it can do so. /// /// This method will block until the internal count of the semaphore is at /// least 1. 
pub fn acquire(&self) { let mut count = self.lock.lock().unwrap(); while *count <= 0 { count = self.cvar.wait(count).unwrap(); } *count -= 1; } /// Release a resource from this semaphore. /// /// This will increment the number of resources in this semaphore by 1 and /// will notify any pending waiters in `acquire` or `access` if necessary. pub fn release(&self) { *self.lock.lock().unwrap() += 1; self.cvar.notify_one(); } /// Acquires a resource of this semaphore, returning an RAII guard to /// release the semaphore when dropped. /// /// This function is semantically equivalent to an `acquire` followed by a /// `release` when the guard returned is dropped. pub fn access(&self) -> SemaphoreGuard<'_> { self.acquire(); SemaphoreGuard { sem: self } } /// Acquires a resource of this semaphore, returning an RAII guard to /// release the semaphore when dropped. Allows moving the guard to a different /// thread. /// /// This function is semantically equivalent to an `acquire` followed by a /// `release` when the guard returned is dropped. pub fn access_owned(self: Arc<Self>) -> OwnedSemaphoreGuard { self.acquire(); OwnedSemaphoreGuard { sem: self } } } impl<'a> Drop for SemaphoreGuard<'a> { fn drop(&mut self) { self.sem.release(); } } impl Drop for OwnedSemaphoreGuard { fn drop(&mut self) { self.sem.release(); } } #[cfg(test)] mod tests { use std::prelude::v1::*; use super::Semaphore; use std::sync::mpsc::channel; use std::sync::Arc; use std::thread; #[test] fn test_sem_acquire_release() { let s = Semaphore::new(1); s.acquire(); s.release(); s.acquire(); } #[test] fn test_sem_basic() { let s = Semaphore::new(1); let _g = s.access(); } #[test] fn test_sem_as_mutex() { let s = Arc::new(Semaphore::new(1)); let s2 = s.clone(); let _t = thread::spawn(move || { let _g = s2.access(); }); let _g = s.access(); } #[test] fn test_sem_as_mutex_owned_guard() { let s = Arc::new(Semaphore::new(1)); let g = s.clone().access_owned(); let _t = thread::spawn(move || { drop(g); }); let _g = s.access(); } #[test] fn test_sem_as_cvar() { // Child waits and parent signals let (tx, rx) = channel(); let s = Arc::new(Semaphore::new(0)); let s2 = s.clone(); let _t = thread::spawn(move || { s2.acquire(); tx.send(()).unwrap(); }); s.release(); let _ = rx.recv(); // Parent waits and child signals let (tx, rx) = channel(); let s = Arc::new(Semaphore::new(0)); let s2 = s.clone(); let _t = thread::spawn(move || { s2.release(); let _ = rx.recv(); }); s.acquire(); tx.send(()).unwrap(); } #[test] fn test_sem_multi_resource() { // Parent and child both get in the critical section at the same // time, and shake hands. 
let s = Arc::new(Semaphore::new(2)); let s2 = s.clone(); let (tx1, rx1) = channel(); let (tx2, rx2) = channel(); let _t = thread::spawn(move || { let _g = s2.access(); let _ = rx2.recv(); tx1.send(()).unwrap(); }); let _g = s.access(); tx2.send(()).unwrap(); rx1.recv().unwrap(); } #[test] fn test_sem_runtime_friendly_blocking() { let s = Arc::new(Semaphore::new(1)); let s2 = s.clone(); let (tx, rx) = channel(); { let _g = s.access(); thread::spawn(move || { tx.send(()).unwrap(); drop(s2.access()); tx.send(()).unwrap(); }); rx.recv().unwrap(); // wait for child to come alive } rx.recv().unwrap(); // wait for child to be done } } 07070100000022000081A4000000000000000000000001653E86C200004610000000000000000000000000000000000000002800000000fclones-0.34.0/fclones/src/transform.rsuse std::cell::RefCell; use std::ffi::OsString; use std::fs::{create_dir_all, remove_dir_all, File, OpenOptions}; use std::io; use std::io::Read; use std::path::PathBuf; use std::process::{Child, Command, Stdio}; use std::sync::{Arc, Mutex}; use std::thread::JoinHandle; use nom::branch::alt; use nom::bytes::complete::tag; use nom::character::complete::{none_of, one_of}; use nom::combinator::map; use nom::error::{ErrorKind, ParseError}; use nom::multi::{many1, separated_list0}; use nom::sequence::tuple; use nom::IResult; use regex::Regex; use uuid::Uuid; use crate::path::Path; /// Controls how we pass data to the child process. /// By default, the file to process is sent to the standard input of the child process. /// Some programs do not accept reading input from the stdin, but prefer to be pointed /// to a file by a command-line option - in this case `Named` variant is used. enum Input { /// Pipe the input file from the given path to the stdin of the child StdIn(PathBuf), /// Pass the original path to the file as $IN param Named(PathBuf), /// Copy the original file to a temporary location and pass it as $IN param Copied(PathBuf, PathBuf), } impl Input { fn input_path(&self) -> &PathBuf { match self { Input::StdIn(path) => path, Input::Named(path) => path, Input::Copied(_src, target) => target, } } fn prepare_input_file(&self) -> io::Result<()> { match self { Input::StdIn(_path) => Ok(()), Input::Named(_path) => Ok(()), Input::Copied(src, target) => { std::fs::copy(src, target)?; Ok(()) } } } } impl Drop for Input { /// Removes the temporary file if it was created fn drop(&mut self) { let _ = match self { Input::StdIn(_) => Ok(()), Input::Named(_) => Ok(()), Input::Copied(_, target) => std::fs::remove_file(target), }; } } /// Controls how we read data out from the child process. /// By default we read output directly from the standard output of the child process. /// If the preprocessor program can't output data to its stdout, but supports only writing /// to files, it can be configured to write to a named pipe, and we read from that named pipe. enum Output { /// Pipe data directly to StdOut StdOut, /// Send data through a named pipe Named(PathBuf), /// Read data from the same file as the input InPlace(PathBuf), } impl Output { /// Returns the path to the named pipe if the process is configured to write to a pipe. /// Returns None if the process is configured to write to stdout or to modify the input file. 
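/// For example (illustrative), `Output::Named(path)` yields `Some(path)`, while
/// `Output::StdOut` and `Output::InPlace(_)` yield `None`.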
pub fn pipe_path(&self) -> Option<PathBuf> { match &self { Output::Named(output) => Some(output.clone()), _ => None, } } } impl Drop for Output { /// Removes the output file if it was created fn drop(&mut self) { let _ = match self { Output::StdOut => Ok(()), Output::Named(target) => std::fs::remove_file(target), Output::InPlace(target) => std::fs::remove_file(target), }; } } /// Transforms files through an external program. /// The `command_str` field contains a path to a program and its space separated arguments. /// The command takes a file given in the `$IN` variable and produces an `$OUT` file. #[derive(Clone)] pub struct Transform { /// a path to a program and its space separated arguments pub command_str: String, /// temporary directory for storing files and named pipes pub tmp_dir: PathBuf, /// copy the file into temporary directory before running the transform on it pub copy: bool, /// read output from the same location as the original pub in_place: bool, /// will be set to the name of the program, extracted from the command_str pub program: String, } impl Transform { pub fn new(command_str: String, in_place: bool) -> io::Result<Transform> { let has_in = RefCell::new(false); let has_out = RefCell::new(false); let parsed = parse_command(&command_str, |s: &str| { match s { "OUT" if cfg!(windows) => *has_out.borrow_mut() = true, "IN" => *has_in.borrow_mut() = true, _ => {} }; OsString::from(s) }); let has_in = has_in.into_inner(); let has_out = has_out.into_inner(); if cfg!(windows) && has_out { return Err(io::Error::new( io::ErrorKind::Other, "$OUT not supported on Windows yet", )); } if in_place && has_out { return Err(io::Error::new( io::ErrorKind::Other, "$OUT conflicts with --in-place", )); } if in_place && !has_in { return Err(io::Error::new( io::ErrorKind::Other, "$IN required with --in-place", )); } let program = parsed .first() .and_then(|p| PathBuf::from(p).file_name().map(|s| s.to_os_string())); let program = match program { Some(p) => p.into_string().unwrap(), None => { return Err(io::Error::new( io::ErrorKind::Other, "Command cannot be empty", )) } }; // Check if the program is runnable, fail fast if it is not. 
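// We do this by spawning the program with no arguments and killing it right away:
// if the spawn itself fails (e.g. the program is missing from the PATH or is not
// executable), the error is reported to the user before any files are processed.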
match Command::new(&program).spawn() { Ok(mut child) => { let _ignore = child.kill(); } Err(e) => { return Err(io::Error::new( e.kind(), format!("Cannot launch {program}: {e}"), )) } } Ok(Transform { command_str, program, tmp_dir: Transform::create_temp_dir()?, copy: has_in, in_place, }) } /// Creates the directory where preprocessed files will be stored fn create_temp_dir() -> io::Result<PathBuf> { let tmp = std::env::temp_dir().join(format!("fclones-{:032x}", Uuid::new_v4().as_u128())); match create_dir_all(&tmp) { Ok(()) => Ok(tmp), Err(e) => Err(io::Error::new( e.kind(), format!( "Failed to create temporary directory {}: {}", tmp.display(), e ), )), } } /// Creates a new unique random file name in the temporary directory fn random_tmp_file_name(&self) -> PathBuf { self.tmp_dir .join(format!("{:032x}", Uuid::new_v4().as_u128())) } /// Returns the output file path for the given input file path pub fn output(&self, input: &Path) -> PathBuf { self.tmp_dir.join(format!("{:x}", input.hash128())) } /// Processes the input file and returns its output and err as stream pub fn run(&self, input: &Path) -> io::Result<Execution> { let (args, input_conf, output_conf) = self.make_args(input); let mut command = build_command(&args, &input_conf, &output_conf)?; let result = execute(&mut command, input_conf, output_conf)?; Ok(result) } /// Creates arguments, input and output configuration for processing given input path. /// The first element of the argument vector contains the program name. fn make_args(&self, input: &Path) -> (Vec<OsString>, Input, Output) { let input_conf = RefCell::<Input>::new(Input::StdIn(input.to_path_buf())); let output_conf = RefCell::<Output>::new(Output::StdOut); let args = parse_command(self.command_str.as_str(), |arg| match arg { "IN" if self.copy => { let tmp_target = self.random_tmp_file_name(); input_conf.replace(Input::Copied(input.to_path_buf(), tmp_target.clone())); tmp_target.into_os_string() } "IN" => { let input = input.to_path_buf(); input_conf.replace(Input::Named(input.clone())); input.into_os_string() } "OUT" => { let output = self.output(input); output_conf.replace(Output::Named(output.clone())); output.into_os_string() } _ => OsString::from(arg), }); let input_conf = input_conf.into_inner(); let mut output_conf = output_conf.into_inner(); if self.in_place { output_conf = Output::InPlace(input_conf.input_path().clone()) } (args, input_conf, output_conf) } } /// Cleans up temporary files impl Drop for Transform { fn drop(&mut self) { let _ = remove_dir_all(&self.tmp_dir); } } /// Keeps the results of the transform program execution pub struct Execution { pub(crate) child: Arc<Mutex<Child>>, pub(crate) out_stream: Box<dyn Read>, pub(crate) err_stream: Option<JoinHandle<String>>, _input: Input, // holds the temporary input file(s) until execution is done _output: Output, // holds the temporary output file(s) until execution is done } impl Drop for Execution { fn drop(&mut self) { let mut buf = [0; 4096]; while let Ok(1..) 
= self.out_stream.read(&mut buf) {} let _ = self.child.lock().unwrap().wait(); } } /// Builds the `Command` struct from the parsed arguments fn build_command( args: &[OsString], input_conf: &Input, output_conf: &Output, ) -> io::Result<Command> { let mut args = args.iter(); let mut command = Command::new(args.next().unwrap()); command.args(args); command.stderr(Stdio::piped()); input_conf.prepare_input_file()?; if let Input::StdIn(_) = input_conf { command.stdin(File::open(input_conf.input_path())?); } else { command.stdin(Stdio::null()); } if let Output::Named(output) = output_conf { command.stdout(Stdio::null()); create_named_pipe(output)?; } else { command.stdout(Stdio::piped()); } Ok(command) } #[cfg(unix)] fn create_named_pipe(path: &std::path::Path) -> io::Result<()> { use nix::sys::stat; use nix::unistd::mkfifo; if let Err(e) = mkfifo(path, stat::Mode::S_IRWXU) { let io_err: io::Error = e.into(); return Err(io::Error::new( io_err.kind(), format!("Failed to create named pipe {}: {}", path.display(), io_err), )); } Ok(()) } #[cfg(windows)] fn create_named_pipe(_path: &PathBuf) -> io::Result<()> { unimplemented!() } /// Spawns the command process, and returns its output as a stream. /// The standard error is captured by a background thread and read to a string. fn execute(command: &mut Command, input: Input, output: Output) -> io::Result<Execution> { let child = Arc::new(Mutex::new(command.spawn()?)); // We call 'take' to avoid borrowing `child` for longer than a single line. // We can't reference stdout/stderr directly, because a mutable borrow of a field // creates a mutable borrow of the containing struct, but later we need to mutably // borrow `child` again to wait on it. let child_out = child.lock().unwrap().stdout.take(); let child_err = child.lock().unwrap().stderr.take(); let output_pipe = output.pipe_path(); let child_ref = child.clone(); // Capture the stderr in background in order to avoid a deadlock when the child process // would block on writing to stdout, and this process would block on reading stderr // (or the other way round). // The other solution could be to use non-blocking I/O, but threads look simpler. let stderr_reaper = std::thread::spawn(move || { let mut str = String::new(); if let Some(mut stream) = child_err { let _ = stream.read_to_string(&mut str); } // If the child is supposed to communicate its output through a named pipe, // ensure the pipe gets closed and the reader at the other end receives an EOF. // It is possible that due to a misconfiguration // (e.g. wrong arguments given by the user) the child would never open the output file // and the reader at the other end would block forever. if let Some(output_pipe) = output_pipe { // If those fail, we have no way to report the failure. // However if waiting fails here, the child process likely doesn't run, so that's not // a problem. let _ignore = child_ref.lock().unwrap().wait(); let _ignore = OpenOptions::new().write(true).open(output_pipe); } str }); let child_out: Box<dyn Read> = match &output { Output::StdOut => Box::new(child_out.unwrap()), Output::Named(output) => Box::new(File::open(output)?), Output::InPlace(output) => { child.lock().unwrap().wait()?; Box::new(File::open(output)?) } }; Ok(Execution { child, out_stream: child_out, err_stream: Some(stderr_reaper), _input: input, _output: output, }) } /// Compares the input with a regular expression and returns the first match. /// Backported from nom 6.0.0-alpha1. 
We can't use nom 6.0.0-alpha1 directly, /// because it had some issues with our use of functions in pattern.rs. fn re_find<'s, E>(re: Regex) -> impl Fn(&'s str) -> IResult<&'s str, &'s str, E> where E: ParseError<&'s str>, { move |i| { if let Some(m) = re.find(i) { Ok((&i[m.end()..], &i[m.start()..m.end()])) } else { Err(nom::Err::Error(E::from_error_kind( i, ErrorKind::RegexpMatch, ))) } } } /// Splits the command string into separate arguments and substitutes $params fn parse_command<F>(command: &str, substitute: F) -> Vec<OsString> where F: Fn(&str) -> OsString, { fn join_chars(chars: Vec<char>) -> OsString { let mut result = OsString::new(); for c in chars { result.push(c.to_string()) } result } fn join_str(strings: Vec<OsString>) -> OsString { let mut result = OsString::new(); for c in strings { result.push(c) } result } let r_var = Regex::new(r"^([[:alnum:]]|_)+").unwrap(); let p_var = map(tuple((tag("$"), re_find(r_var))), |(_, str)| { (substitute)(str) }); let p_non_var = map(many1(none_of(" $")), join_chars); let p_arg = map(many1(alt((p_var, p_non_var))), join_str); let p_whitespace = many1(one_of(" \t")); let p_args = |s| separated_list0(p_whitespace, p_arg)(s); let result: IResult<&str, Vec<OsString>> = (p_args)(command); result.expect("Parse error").1 } #[cfg(test)] mod test { use std::io::Write; use crate::file::{FileChunk, FileLen, FilePos}; use crate::hasher::{FileHasher, HashFn}; use crate::log::StdLog; use crate::util::test::with_dir; use super::*; #[test] fn empty() { assert!(Transform::new(String::from(" "), false).is_err()); } #[test] #[cfg(unix)] fn piped() { with_dir("target/test/transform/piped/", |root| { let transform = Transform::new(String::from("dd"), false).unwrap(); let input_path = root.join("input.txt"); let mut input = File::create(&input_path).unwrap(); let content = b"content"; input.write_all(content).unwrap(); drop(input); let log = StdLog::default(); let hasher = FileHasher::new(HashFn::default(), Some(transform), &log); let input_path = Path::from(input_path); let chunk = FileChunk::new(&input_path, FilePos(0), FileLen::MAX); let good_file_hash = hasher.hash_file(&chunk, |_| {}).unwrap(); let result = hasher.hash_transformed(&chunk, |_| {}).unwrap(); assert_eq!(result.0, FileLen(content.len() as u64)); assert_eq!(result.1, good_file_hash); }) } #[test] #[cfg(unix)] fn parameterized() { with_dir("target/test/transform/param/", |root| { let transform = Transform::new(String::from("dd if=$IN of=$OUT"), false).unwrap(); let input_path = root.join("input.txt"); let mut input = File::create(&input_path).unwrap(); let content = b"content"; input.write_all(content).unwrap(); drop(input); let log = StdLog::default(); let hasher = FileHasher::new(HashFn::default(), Some(transform), &log); let input_path = Path::from(input_path); let chunk = FileChunk::new(&input_path, FilePos(0), FileLen::MAX); let good_file_hash = hasher.hash_file(&chunk, |_| {}).unwrap(); let result = hasher.hash_transformed(&chunk, |_| {}).unwrap(); assert_eq!(result.0, FileLen(content.len() as u64)); assert_eq!(result.1, good_file_hash); }) } #[test] fn parse_command() { let result = super::parse_command("foo bar", |s| OsString::from(s)); assert_eq!(result, vec![OsString::from("foo"), OsString::from("bar")]) } #[test] fn parse_command_substitute() { let result = super::parse_command("foo bar in=$IN", |s| match s { "IN" => OsString::from("/input"), _ => OsString::from(s), }); assert_eq!( result, vec![ OsString::from("foo"), OsString::from("bar"), OsString::from("in=/input") ] ) } } 
07070100000023000081A4000000000000000000000001653E86C200001AFF000000000000000000000000000000000000002300000000fclones-0.34.0/fclones/src/util.rsuse itertools::Itertools; use serde::{Serialize, Serializer}; use std::cell::Cell; /// Allows for serializing iterators pub struct IteratorWrapper<T>(pub Cell<Option<T>>); impl<I, P> Serialize for IteratorWrapper<I> where I: IntoIterator<Item = P>, P: Serialize, { fn serialize<S: Serializer>(&self, s: S) -> Result<S::Ok, S::Error> { s.collect_seq(self.0.take().unwrap()) } } /// Sorts an array using a key generation function that can fail. /// Items for which the key could not be obtained are sorted last. /// Returns vector of errors encountered when obtaining the keys. pub fn try_sort_by_key<T, K, E>(v: &mut [T], f: impl Fn(&T) -> Result<K, E>) -> Vec<E> where K: Ord, { let mut errors: Vec<E> = Vec::new(); v.sort_by_key(|t| f(t).map_err(|e| errors.push(e)).ok()); errors } /// Reduces the elements to a single one, by repeatedly applying a reducing operation. /// If the iterator is empty, returns `Ok(None)`; otherwise, returns the result of the reduction. /// If any of the elements is `Err`, returns the first `Err`. pub fn reduce_results<I, T, E, F>(mut iter: I, f: F) -> Result<Option<T>, E> where I: Iterator<Item = Result<T, E>>, F: Fn(T, T) -> T, { iter.fold_ok(None, |res, item| match res { Some(res) => Some(f(res, item)), None => Some(item), }) } /// Finds the minimum value. /// If any of the values is `Err`, returns the first `Err`. /// If the input iterator is empty, returns `Ok(None)`. pub fn min_result<I, T, E>(iter: I) -> Result<Option<T>, E> where I: Iterator<Item = Result<T, E>>, T: Ord, { reduce_results(iter, T::min) } /// Finds the maximum value. /// If any of the values is `Err`, returns the first `Err`. /// If the input iterator is empty, returns `Ok(None)`. pub fn max_result<I, T, E>(iter: I) -> Result<Option<T>, E> where I: Iterator<Item = Result<T, E>>, T: Ord, { reduce_results(iter, T::max) } /// Utility functions intended for testing. /// Beware they typically panic instead of returning `Err`. #[cfg(test)] pub mod test { use std::fs::{create_dir_all, remove_dir_all, File}; use std::io::{BufReader, Read, Write}; use std::path::PathBuf; use std::sync::Mutex; use std::time::SystemTime; use std::{fs, thread}; use super::*; use lazy_static::lazy_static; #[derive(Debug, PartialEq, Eq)] enum FsSupportsReflink { Untested, Supported(bool), } lazy_static! { static ref REFLINK_SUPPORTED: Mutex<FsSupportsReflink> = Mutex::new(FsSupportsReflink::Untested); } /// Runs test code that needs access to temporary file storage. /// Makes sure the test root directory exists and is empty. /// Returns the return value of the test code and recursively deletes /// the directory after the test, unless the test fails. pub fn with_dir<F, R>(test_root: &str, test_code: F) -> R where F: FnOnce(&PathBuf) -> R, { let test_root = PathBuf::from("target/test").join(test_root); // Quick sanity check: Joining a relative with an absolute path // returns an absolute path. if test_root.is_absolute() && !test_root.starts_with("/dev/shm/") { panic!("Internal test error: only use relative paths!"); } remove_dir_all(&test_root).ok(); create_dir_all(&test_root).unwrap(); let ret = test_code(&test_root.canonicalize().unwrap()); remove_dir_all(&test_root).ok(); ret } /// Creates a new empty file. /// If the file existed before, it will be first removed so that the creation time /// is updated. 
pub fn create_file(path: &std::path::Path) { let _ = fs::remove_file(path); File::create(path).unwrap(); } /// Creates a new empty file with creation time after (not equal) the given time. /// /// This function is used to create multiple files differing by creation time. /// It adapts to the operating system timer resolution. /// /// Returns the creation time of the newly created file. /// Panics if `time` is in future or if file could not be created. pub fn create_file_newer_than(f: &PathBuf, time: SystemTime) -> SystemTime { assert!(SystemTime::now() >= time); let mut delay = std::time::Duration::from_millis(1); loop { thread::sleep(delay); create_file(f); let ctime = fs::metadata(f).unwrap().modified().unwrap(); if ctime != time { return ctime; } delay *= 2; } } /// Writes contents to a new file. Overwrites file if it exists. /// Panics on errors. pub fn write_file(path: &std::path::Path, content: &str) { let mut f = File::create(path).unwrap(); write!(&mut f, "{content}").unwrap(); } /// Reads contents of a file to a string. /// Panics on errors. pub fn read_file(path: &std::path::Path) -> String { let f = File::open(path).unwrap(); let mut r = BufReader::new(f); let mut result = String::new(); r.read_to_string(&mut result).unwrap(); result } pub fn cached_reflink_supported() -> bool { let mut guard = REFLINK_SUPPORTED.lock().unwrap(); match *guard { FsSupportsReflink::Untested => { with_dir("fs_supports_reflink", |test_dir| { let src_file = test_dir.join("src_file"); let dest_file = test_dir.join("dest_file"); write_file(&src_file, "1"); let result = reflink::reflink(src_file, dest_file).is_ok(); *guard = FsSupportsReflink::Supported(result); if !result { println!(" Notice: filesystem does not support reflinks, skipping related tests") } result }) } FsSupportsReflink::Supported(val) => val, } } #[test] fn min_result_should_return_none_if_no_elements() { let elements: Vec<Result<i64, &str>> = vec![]; assert_eq!(min_result(elements.into_iter()), Ok(None)); } #[test] fn min_result_should_return_min_if_all_ok() { let elements: Vec<Result<i64, &str>> = vec![Ok(1), Ok(3), Ok(2)]; assert_eq!(min_result(elements.into_iter()), Ok(Some(1))); } #[test] fn min_result_should_return_err_if_at_least_one_err() { let elements: Vec<Result<i64, &str>> = vec![Ok(1), Ok(3), Err("error"), Ok(2)]; assert_eq!(min_result(elements.into_iter()), Err("error")); } #[test] fn max_result_should_return_max_if_all_ok() { let elements: Vec<Result<i64, &str>> = vec![Ok(1), Ok(3), Ok(2)]; assert_eq!(max_result(elements.into_iter()), Ok(Some(3))); } } 07070100000024000081A4000000000000000000000001653E86C200005A3B000000000000000000000000000000000000002300000000fclones-0.34.0/fclones/src/walk.rsuse std::default::Default; use std::env::current_dir; use std::fs::{read_link, symlink_metadata, DirEntry, FileType, ReadDir}; use std::sync::Arc; use std::{fs, io}; use crate::FileId; use dashmap::DashSet; use ignore::gitignore::{Gitignore, GitignoreBuilder}; use rayon::Scope; use crate::log::{Log, LogExt}; use crate::path::Path; use crate::selector::PathSelector; #[derive(Clone, Copy, Debug, PartialEq, Eq)] enum EntryType { File, Dir, SymLink, Other, } impl EntryType { pub fn from(file_type: FileType) -> EntryType { if file_type.is_symlink() { EntryType::SymLink } else if file_type.is_file() { EntryType::File } else if file_type.is_dir() { EntryType::Dir } else { EntryType::Other } } } /// A path to a file, directory or symbolic link. 
/// Provides an abstraction over `Path` and `DirEntry` #[derive(Debug)] struct Entry { tpe: EntryType, path: Path, } impl Entry { pub fn new(file_type: FileType, path: Path) -> Entry { Entry { tpe: EntryType::from(file_type), path, } } pub fn from_path(path: Path) -> io::Result<Entry> { symlink_metadata(path.to_path_buf()).map(|meta| Entry::new(meta.file_type(), path)) } pub fn from_dir_entry(base: &Arc<Path>, dir_entry: DirEntry) -> io::Result<Entry> { let path = base.join(Path::from(dir_entry.file_name())); dir_entry.file_type().map(|ft| Entry::new(ft, path)) } } #[derive(Clone)] struct IgnoreStack(Arc<Vec<Gitignore>>); impl IgnoreStack { /// Returns ignore stack initialized with global gitignore settings. fn new(log: Option<&dyn Log>) -> Self { let gitignore = GitignoreBuilder::new("/").build_global(); if let Some(err) = gitignore.1 { if let Some(log) = log { log.warn(format!("Error loading global gitignore rules: {err}")) } } IgnoreStack(Arc::new(vec![gitignore.0])) } /// Returns an empty gitignore stack that ignores no files. pub fn empty() -> IgnoreStack { IgnoreStack(Arc::new(vec![])) } /// If .gitignore file exists in given dir, creates a `Gitignore` struct for it /// and returns the stack with the new `Gitignore` appended. Otherwise returns a cloned self. pub fn push(&self, dir: &Path, log: Option<&dyn Log>) -> IgnoreStack { let mut path = Arc::new(dir.clone()).resolve(Path::from(".gitignore")); let mut path_buf = path.to_path_buf(); if !path_buf.is_file() { path = Arc::new(dir.clone()).resolve(Path::from(".fdignore")); path_buf = path.to_path_buf(); } if !path_buf.is_file() { return self.clone(); } let gitignore = Gitignore::new(&path_buf); if let Some(err) = gitignore.1 { if let Some(log) = log { log.warn(format!( "Error while loading ignore file {}: {}", path.display(), err )) } } let mut stack = self.0.as_ref().clone(); stack.push(gitignore.0); IgnoreStack(Arc::new(stack)) } /// Returns true if any of the gitignore files in the stack selects given path pub fn matches(&self, path: &Path, is_dir: bool) -> bool { // this is on critical performance path, so avoid unnecessary to_path_buf conversion if self.0.is_empty() { return false; } let path = path.to_path_buf(); self.0 .iter() .any(|gitignore| gitignore.matched(&path, is_dir).is_ignore()) } } type DeviceId = u64; /// Describes walk configuration. /// Many walks can be initiated from the same instance. pub struct Walk<'a> { /// Relative root paths are resolved against this dir. pub base_dir: Arc<Path>, /// Maximum allowed directory nesting level. 0 means "do not descend to directories at all". pub depth: usize, /// Include hidden files. pub hidden: bool, /// Resolve symlinks to dirs and files. /// For a symlink to a file, report the target file, unless `report_links` is true. pub follow_links: bool, /// Don't follow symlinks to files, but report them. pub report_links: bool, /// Don't honor .gitignore and .fdignore. pub no_ignore: bool, /// Don't leave the fs of the root paths. pub one_fs: bool, /// Controls selecting or ignoring files by matching file and path names with regexes / globs. pub path_selector: PathSelector, /// The function to call for each visited file. The directories are not reported. pub on_visit: &'a (dyn Fn(&Path) + Sync + Send), /// Warnings about inaccessible files or dirs are logged here, if defined. pub log: Option<&'a dyn Log>, } /// Private shared state scoped to a single `run` invocation. 
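// A minimal usage sketch for `Walk::run` (illustrative; the root path and the
// consumer closure below are assumptions, not part of the original source):
//
//     let walk = Walk::new();
//     walk.run(vec![Path::from("/data")], |file| {
//         println!("found: {}", file.display());
//     });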
struct WalkState<F> { pub consumer: F, pub visited: DashSet<u128>, } impl<'a> Walk<'a> { /// Creates a default walk with empty root dirs, no link following and logger set to stderr pub fn new() -> Walk<'a> { let base_dir = Path::from(&current_dir().unwrap_or_default()); Walk { base_dir: Arc::new(base_dir.clone()), depth: usize::MAX, hidden: false, follow_links: false, report_links: false, no_ignore: false, one_fs: false, path_selector: PathSelector::new(base_dir), on_visit: &|_| {}, log: None, } } /// Walks multiple directories recursively in parallel and sends found files to `consumer`. /// Input paths can be relative to the current working directory, /// but produced paths are absolute. /// The `consumer` must be able to receive items from many threads. /// Inaccessible files are skipped, but errors are printed to stderr. /// The order of files is not specified and may be different every time. pub fn run<I, F>(&self, roots: I, consumer: F) where I: IntoIterator<Item = Path> + Send, F: Fn(Path) + Sync + Send, { let state = WalkState { consumer, visited: DashSet::new(), }; rayon::scope(|scope| { let ignore = if self.no_ignore { IgnoreStack::empty() } else { IgnoreStack::new(self.log) }; for p in roots.into_iter() { let p = self.absolute(p); let ignore = ignore.clone(); match fs::metadata(&p.to_path_buf()) { Ok(metadata) if metadata.is_dir() && self.depth == 0 => self.log_warn(format!( "Skipping directory {} because recursive scan is disabled.", p.display() )), #[cfg(unix)] Ok(metadata) => { let dev = FileId::from_metadata(&metadata).device; let state = &state; scope.spawn(move |scope| self.visit_path(p, dev, scope, 0, ignore, state)) } #[cfg(windows)] Ok(_) => { let dev = FileId::new(&p).map(|f| f.device); match dev { Err(err) if self.one_fs => self.log_warn(format!( "Failed to get device information for {}: {}", p.display(), err )), _ => { let dev = dev.unwrap_or_default(); let state = &state; scope.spawn(move |scope| { self.visit_path(p, dev, scope, 0, ignore, state) }) } } } Err(err) => { self.log_warn(format!("Cannot stat {}: {}", p.display(), err)); } } } }); } /// Visits path of any type (can be a symlink target, file or dir) fn visit_path<'s, 'w, F>( &'s self, path: Path, dev: DeviceId, scope: &Scope<'w>, level: usize, gitignore: IgnoreStack, state: &'w WalkState<F>, ) where F: Fn(Path) + Sync + Send, 's: 'w, { if self.path_selector.matches_dir(&path) { Entry::from_path(path.clone()) .map_err(|e| self.log_warn(format!("Failed to stat {}: {}", path.display(), e))) .into_iter() .for_each(|entry| { self.visit_entry(entry, dev, scope, level, gitignore.clone(), state) }) } } /// Visits a path that was already converted to an `Entry` so the entry type is known. /// Faster than `visit_path` because it doesn't need to call `stat` internally. fn visit_entry<'s, 'w, F>( &'s self, entry: Entry, dev: DeviceId, scope: &Scope<'w>, level: usize, gitignore: IgnoreStack, state: &'w WalkState<F>, ) where F: Fn(Path) + Sync + Send, 's: 'w, { // For progress reporting (self.on_visit)(&entry.path); // Skip hidden files if !self.hidden { if let Some(name) = entry.path.file_name_cstr() { if name.to_string_lossy().starts_with('.') { return; } } } // Skip already visited paths. We're checking only when follow_links is true, // because inserting into a shared hash set is costly.
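// DashSet::insert returns false when the value is already present, so a path reached a second time (e.g. through a symlink cycle) is dropped here.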
if self.follow_links && !state.visited.insert(entry.path.hash128()) { return; } // Skip entries ignored by .gitignore if !self.no_ignore && gitignore.matches(&entry.path, entry.tpe == EntryType::Dir) { return; } match entry.tpe { EntryType::File => self.visit_file(entry.path, state), EntryType::Dir => self.visit_dir(entry.path, dev, scope, level, gitignore, state), EntryType::SymLink => self.visit_link(entry.path, dev, scope, level, gitignore, state), EntryType::Other => {} } } /// If file matches selection criteria, sends it to the consumer fn visit_file<F>(&self, path: Path, state: &WalkState<F>) where F: Fn(Path) + Sync + Send, { if self.path_selector.matches_full_path(&path) { (state.consumer)(path) } } /// Resolves a symbolic link. /// If `follow_links` is set to false, does nothing. fn visit_link<'s, 'w, F>( &'s self, path: Path, dev: DeviceId, scope: &Scope<'w>, level: usize, gitignore: IgnoreStack, state: &'w WalkState<F>, ) where F: Fn(Path) + Sync + Send, 's: 'w, { if self.follow_links || self.report_links { match self.resolve_link(&path) { Ok((_, EntryType::File)) if self.report_links => self.visit_file(path, state), Ok((target, _)) => { if self.follow_links && (!self.one_fs || self.same_fs(&target, dev)) { self.visit_path(target, dev, scope, level, gitignore, state); } } Err(e) => self.log_warn(format!("Failed to read link {}: {}", path.display(), e)), } } } /// Reads the contents of the directory pointed to by `path` /// and recursively visits each child entry fn visit_dir<'s, 'w, F>( &'s self, path: Path, dev: DeviceId, scope: &Scope<'w>, level: usize, gitignore: IgnoreStack, state: &'w WalkState<F>, ) where F: Fn(Path) + Sync + Send, 's: 'w, { if level > self.depth { return; } if !self.path_selector.matches_dir(&path) { return; } if self.one_fs && !self.same_fs(&path, dev) { return; } let gitignore = if self.no_ignore { gitignore } else { gitignore.push(&path, self.log) }; match fs::read_dir(path.to_path_buf()) { Ok(rd) => { for entry in Self::sorted_entries(path, rd) { let gitignore = gitignore.clone(); scope.spawn(move |s| { self.visit_entry(entry, dev, s, level + 1, gitignore, state) }) } } Err(e) => self.log_warn(format!("Failed to read dir {}: {}", path.display(), e)), } } #[cfg(unix)] fn sort_dir_entries_by_inode(entries: &mut Vec<DirEntry>) { use rayon::prelude::ParallelSliceMut; use std::os::unix::fs::DirEntryExt; entries.par_sort_unstable_by_key(|entry| entry.ino()) } #[cfg(not(unix))] fn sort_dir_entries_by_inode(_: &mut Vec<DirEntry>) { // do nothing } /// Sorts dir entries so that regular files are at the end. /// Because each worker's queue is a LIFO, the files would be picked up first and the /// dirs would be on the other side, amenable for stealing by other workers. 
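/// On Unix the entries are additionally ordered by inode number (see `sort_dir_entries_by_inode` above), which should reduce seek times on rotational drives.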
fn sorted_entries(parent: Path, rd: ReadDir) -> impl Iterator<Item = Entry> { let mut files = vec![]; let mut links = vec![]; let mut dirs = vec![]; let path = Arc::new(parent); let mut entries: Vec<DirEntry> = rd.filter_map(|e| e.ok()).collect(); // Accessing entries in the order of identifiers should be faster on rotational drives Self::sort_dir_entries_by_inode(&mut entries); entries .into_iter() .filter_map(|e| Entry::from_dir_entry(&path, e).ok()) .for_each(|e| match e.tpe { EntryType::File => files.push(e), EntryType::SymLink => links.push(e), EntryType::Dir => dirs.push(e), EntryType::Other => {} }); dirs.into_iter().chain(links).chain(files) } /// Returns the absolute target path of a symbolic link with the type of the target fn resolve_link(&self, link: &Path) -> io::Result<(Path, EntryType)> { let link_buf = link.to_path_buf(); let target = read_link(&link_buf)?; let entry_type = EntryType::from(link_buf.metadata()?.file_type()); let target = Path::from(target); let resolved = if target.is_relative() { link.parent().unwrap().join(target) } else { target }; Ok((self.absolute(resolved), entry_type)) } /// Returns true if the file belongs to the given filesystem fn same_fs(&self, path: &Path, device: DeviceId) -> bool { match FileId::new(path) { Ok(file_id) => file_id.device == device, Err(err) => { self.log_warn(format!( "Cannot read device id of {}: {}", path.display(), err )); false } } } /// Returns absolute path with removed `.` and `..` components. /// Relative paths are resolved against `self.base_dir`. /// Symbolic links to directories are resolved. /// File symlinks are not resolved, because we need fn absolute(&self, mut path: Path) -> Path { if path.is_relative() { path = self.base_dir.join(path) } if path.to_path_buf().is_file() { // for files we are sure there will be a parent and a file name let parent = path.parent().unwrap().canonicalize(); let file_name = path.file_name().unwrap(); Arc::new(parent).join(Path::from(file_name)) } else { path.canonicalize() } } /// Logs a warning fn log_warn(&self, msg: String) { self.log.iter().for_each(|l| l.warn(&msg)) } } impl<'a> Default for Walk<'a> { fn default() -> Self { Self::new() } } #[cfg(test)] mod test { use std::fs::{create_dir, File}; use std::path::PathBuf; use std::sync::Mutex; use crate::util::test::*; use super::*; #[test] fn list_files() { with_dir("target/test/walk/1/", |test_root| { let file1 = test_root.join("file1.txt"); let file2 = test_root.join("file2.txt"); File::create(&file1).unwrap(); File::create(&file2).unwrap(); let walk = Walk::new(); assert_eq!(run_walk(walk, test_root.clone()), vec![file1, file2]); }); } #[test] fn descend_into_nested_dirs() { with_dir("target/test/walk/2/", |test_root| { let dir = test_root.join("dir"); create_dir(&dir).unwrap(); let file = dir.join("file.txt"); File::create(&file).unwrap(); let walk = Walk::new(); assert_eq!(run_walk(walk, test_root.clone()), vec![file]); }); } #[test] #[cfg(unix)] fn follow_rel_file_sym_links() { with_dir("target/test/walk/3/", |test_root| { use std::os::unix::fs::symlink; let file = test_root.join("file.txt"); let link = test_root.join("link"); File::create(&file).unwrap(); symlink(PathBuf::from("file.txt"), &link).unwrap(); // link -> file.txt let mut walk = Walk::new(); walk.follow_links = true; assert_eq!(run_walk(walk, link), vec![file]); }); } #[test] #[cfg(unix)] fn report_rel_file_sym_links() { with_dir("target/test/walk/report_symlinks/", |test_root| { use std::os::unix::fs::symlink; let file = test_root.join("file.txt"); let link1 = 
test_root.join("link1"); let link2 = test_root.join("link2"); File::create(file).unwrap(); symlink(PathBuf::from("file.txt"), &link1).unwrap(); // link1 -> file.txt symlink(PathBuf::from("link1"), &link2).unwrap(); // link2 -> link1 let mut walk1 = Walk::new(); walk1.report_links = true; assert_eq!(run_walk(walk1, link1.clone()), vec![link1]); // a link to a link should also be reported let mut walk2 = Walk::new(); walk2.report_links = true; assert_eq!(run_walk(walk2, link2.clone()), vec![link2]); }); } #[test] #[cfg(unix)] fn follow_rel_dir_sym_links() { with_dir("target/test/walk/4/", |test_root| { use std::os::unix::fs::symlink; let dir = test_root.join("dir"); let link = test_root.join("link"); let file = dir.join("file.txt"); create_dir(&dir).unwrap(); File::create(&file).unwrap(); symlink(PathBuf::from("dir"), &link).unwrap(); // link -> dir let mut walk = Walk::new(); walk.follow_links = true; assert_eq!(run_walk(walk, link), vec![file]); }); } #[test] #[cfg(unix)] fn follow_abs_dir_sym_links() { with_dir("target/test/walk/5/", |test_root| { use std::os::unix::fs::symlink; let dir = test_root.join("dir"); let link = test_root.join("link"); let file = dir.join("file.txt"); create_dir(&dir).unwrap(); File::create(&file).unwrap(); symlink(dir.canonicalize().unwrap(), &link).unwrap(); // link -> absolute path to dir let mut walk = Walk::new(); walk.follow_links = true; assert_eq!(run_walk(walk, link), vec![file]); }); } #[test] #[cfg(unix)] fn sym_link_cycles() { with_dir("target/test/walk/6/", |test_root| { use std::os::unix::fs::symlink; let dir = test_root.join("dir"); let link = dir.join("link"); let file = dir.join("file.txt"); create_dir(&dir).unwrap(); File::create(&file).unwrap(); // create a link back to the top level, so a cycle is formed symlink(test_root.canonicalize().unwrap(), link).unwrap(); let mut walk = Walk::new(); walk.follow_links = true; assert_eq!(run_walk(walk, test_root.clone()), vec![file]); }); } #[test] fn skip_hidden() { with_dir("target/test/walk/skip_hidden/", |test_root| { let hidden_dir = test_root.join(".dir"); create_dir(&hidden_dir).unwrap(); let hidden_file_1 = hidden_dir.join("file.txt"); let hidden_file_2 = test_root.join(".file.txt"); File::create(hidden_file_1).unwrap(); File::create(hidden_file_2).unwrap(); let mut walk = Walk::new(); walk.hidden = false; assert!(run_walk(walk, test_root.clone()).is_empty()); let mut walk = Walk::new(); walk.hidden = true; assert_eq!(run_walk(walk, test_root.clone()).len(), 2); }); } fn respect_ignore(root: &str, ignore_file: &str) { with_dir(root, |test_root| { use std::io::Write; let mut gitignore = File::create(test_root.join(ignore_file)).unwrap(); writeln!(gitignore, "foo/").unwrap(); writeln!(gitignore, "*.log").unwrap(); writeln!(gitignore, "**/bar").unwrap(); drop(gitignore); create_dir(test_root.join("foo")).unwrap(); create_file(&test_root.join("foo").join("bar")); create_file(&test_root.join("bar.log")); create_dir(test_root.join("dir")).unwrap(); create_dir(test_root.join("dir").join("bar")).unwrap(); create_file(&test_root.join("dir").join("bar").join("file")); let walk = Walk::new(); assert!(run_walk(walk, test_root.clone()).is_empty()); let mut walk = Walk::new(); walk.no_ignore = true; assert_eq!(run_walk(walk, test_root.clone()).len(), 3) }); } #[test] fn respect_gitignore() { respect_ignore("target/test/walk/gitignore/", ".gitignore") } #[test] fn respect_fdignore() { respect_ignore("target/test/walk/fdignore/", ".fdignore") } fn run_walk(walk: Walk, root: PathBuf) -> Vec<PathBuf> { let results 
= Mutex::new(Vec::new()); walk.run(vec![Path::from(root)], |path| { results.lock().unwrap().push(path.to_path_buf()) }); let mut results = results.into_inner().unwrap(); results.sort(); results } } 07070100000025000041ED000000000000000000000002653E86C200000000000000000000000000000000000000000000001E00000000fclones-0.34.0/gen-test-files07070100000026000081A4000000000000000000000001653E86C2000000F5000000000000000000000000000000000000002900000000fclones-0.34.0/gen-test-files/Cargo.toml[package] name = "gen-test-files" version = "0.1.0" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] clap = { version = "4.0", features = ["derive"] } rand = "0.8"07070100000027000041ED000000000000000000000002653E86C200000000000000000000000000000000000000000000002200000000fclones-0.34.0/gen-test-files/src07070100000028000081A4000000000000000000000001653E86C200000350000000000000000000000000000000000000002A00000000fclones-0.34.0/gen-test-files/src/main.rsuse clap::Parser; use rand::{thread_rng, Rng, RngCore}; use std::fs::File; use std::io::Write; use std::path::PathBuf; #[derive(clap::Parser)] struct CmdOptions { #[clap(short = 'n', long, default_value = "100")] count: usize, #[clap(short = 'c', long, default_value = "5")] max_group_size: usize, #[clap(default_value = ".")] target: PathBuf, } fn main() { let options = CmdOptions::parse(); let mut buf = [0u8; 65538]; for i in 0..options.count { let group_size: usize = thread_rng().gen_range(1..options.max_group_size); thread_rng().fill_bytes(&mut buf); for j in 0..group_size { let file_name = options.target.join(format!("file_{i}_{j}.txt")); let mut file = File::create(file_name).unwrap(); file.write_all(&buf).unwrap(); } } } 07070100000029000041ED000000000000000000000002653E86C200000000000000000000000000000000000000000000001900000000fclones-0.34.0/packaging0707010000002A000081A4000000000000000000000001653E86C20000022D000000000000000000000000000000000000002400000000fclones-0.34.0/packaging/DockerfileFROM rust:buster RUN apt-get update RUN apt-get install -y \ fakeroot \ alien \ gcc-mingw-w64-x86-64 \ gcc-x86-64-linux-gnu \ zip RUN rustup toolchain install stable-x86_64-pc-windows-gnu RUN rustup target add x86_64-unknown-linux-gnu RUN rustup target add x86_64-unknown-linux-musl RUN rustup target add i686-unknown-linux-musl RUN rustup target add x86_64-pc-windows-gnu RUN cargo install cargo-deb RUN mkdir /rpmbuild RUN chmod -R a+rwx /rpmbuild RUN chmod -R a+rwx /usr/local/cargo/registry COPY cargo-config.toml .cargo/config.toml 0707010000002B000081ED000000000000000000000001653E86C2000000A4000000000000000000000000000000000000002800000000fclones-0.34.0/packaging/build-image.sh#!/bin/sh # Builds a docker image used for generating various fclones packages IMAGE="pkolaczk/fclones-builder" docker build -t $IMAGE $(realpath "$(dirname $0)") 0707010000002C000081A4000000000000000000000001653E86C2000000C8000000000000000000000000000000000000002B00000000fclones-0.34.0/packaging/cargo-config.toml[target.x86_64-unknown-linux-gnu] linker = "x86_64-linux-gnu-gcc" [target.x86_64-unknown-linux-musl] linker = "x86_64-linux-gnu-gcc" [target.i686-unknown-linux-musl] linker = "x86_64-linux-gnu-gcc" 0707010000002D000081ED000000000000000000000001653E86C2000004F5000000000000000000000000000000000000002D00000000fclones-0.34.0/packaging/package-internal.sh#!/bin/bash set -e bold=$(tput bold) normal=$(tput sgr0) # This script generates packages for a release and places them in target/packages/<version>. 
# Don't use it directly, use package.sh instead. cd "$(dirname $0)/.." echo "${bold}Building${normal}" set -x cargo build --release --target x86_64-unknown-linux-gnu cargo build --release --target x86_64-unknown-linux-musl cargo build --release --target i686-unknown-linux-musl cargo build --release --target x86_64-pc-windows-gnu set +x echo "${bold}Packaging${normal} fclones" set -x VERSION=$(cargo pkgid -p fclones | sed 's/.*#//') PKG_DIR=target/packages/fclones-$VERSION mkdir -p $PKG_DIR rm -f $PKG_DIR/* cargo deb -p fclones --target x86_64-unknown-linux-gnu mv target/x86_64-unknown-linux-gnu/debian/*.deb $PKG_DIR fakeroot alien --to-rpm -c $PKG_DIR/*.deb mv *.rpm $PKG_DIR fakeroot alien --to-tgz -c $PKG_DIR/*.deb mv *.tgz $PKG_DIR/"fclones-$VERSION-linux-glibc-x86_64.tar.gz" tar -zcvf $PKG_DIR/"fclones-$VERSION-linux-musl-x86_64.tar.gz" target/x86_64-unknown-linux-musl/release/fclones tar -zcvf $PKG_DIR/"fclones-$VERSION-linux-musl-i686.tar.gz" target/i686-unknown-linux-musl/release/fclones zip -j $PKG_DIR/"fclones-$VERSION-windows-x86_64.zip" target/x86_64-pc-windows-gnu/release/fclones.exe 0707010000002E000081ED000000000000000000000001653E86C20000022F000000000000000000000000000000000000002400000000fclones-0.34.0/packaging/package.sh#!/bin/sh # This script generates packages for a release and places them in target/packages/<version>. # Uses docker container for better reproducibility of builds. # Additionally the pulled in docker container includes an older libc, therefore generated packages # will be compatible with older Linux distributions. FCLONES_HOME=$(realpath "$(dirname $0)/..") echo $FCLONES_HOME IMAGE="pkolaczk/fclones-builder" docker run \ -v "$FCLONES_HOME":/fclones \ -u $(id -u ${USER}):$(id -g ${USER}) \ -it $IMAGE /fclones/packaging/package-internal.sh 0707010000002F000081A4000000000000000000000001653E86C200000178000000000000000000000000000000000000001E00000000fclones-0.34.0/snapcraft.yamlname: fclones version: git summary: Efficient duplicate file finder and remover description: Finds duplicate, unique, under- or over-replicated files and offers many ways to get rid of them base: core18 confinement: strict grade: stable parts: fclones: plugin: rust source: . apps: fclones: command: bin/fclones plugs: - home - removable-media 07070100000000000000000000000000000000000000010000000000000000000000000000000000000000000000000000000B00000000TRAILER!!!1008 blocks
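For orientation, here is a minimal sketch of how the `Walk` API from fclones-0.34.0/fclones/src/walk.rs above is driven, modelled directly on the `run_walk` test helper in that file. It assumes crate-internal scope (the `Walk` struct and the crate's own `Path` type already imported); the helper name and module paths are illustrative, not part of any published fclones API.

use std::path::PathBuf;
use std::sync::Mutex;

// Hypothetical helper: collect all regular files under `root` into a sorted Vec.
fn collect_files(root: PathBuf) -> Vec<PathBuf> {
    let mut walk = Walk::new();   // defaults: unlimited depth, hidden files skipped, links not followed
    walk.follow_links = true;     // symlink cycles are broken by the shared `visited` set
    let results = Mutex::new(Vec::new());
    // `run` calls the consumer from many rayon worker threads, hence the Mutex.
    walk.run(vec![Path::from(root)], |path| {
        results.lock().unwrap().push(path.to_path_buf())
    });
    let mut results = results.into_inner().unwrap();
    results.sort();
    results
}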