RosettaCodeData/Task/Find-duplicate-files/C++/find-duplicate-files.cpp

147 lines
5.0 KiB
C++

#include<iostream>
#include<string>
#include<boost/filesystem.hpp>
#include<boost/format.hpp>
#include<boost/iostreams/device/mapped_file.hpp>
#include<optional>
#include<algorithm>
#include<iterator>
#include<execution>
#include"dependencies/xxhash.hpp" // https://github.com/RedSpah/xxhash_cpp
/**
* Find ranges (neighbouring elements) of the same value within [begin, end[ and
* call callback for each such range
* @param begin start of container
* @param end end of container (1 beyond last element)
* @param function returns value for each iterator V(*T&)
* @param callback void(start, end, value)
* @return number of range
*/
template<typename T, typename V, typename F>
size_t for_each_adjacent_range(T begin, T end, V getvalue, F callback) {
size_t partitions = 0;
while (begin != end) {
auto const& value = getvalue(*begin);
auto current = begin;
while (++current != end && getvalue(*current) == value);
callback(begin, current, value);
++partitions;
begin = current;
}
return partitions;
}
namespace bi = boost::iostreams;
namespace fs = boost::filesystem;
struct file_entry {
public:
explicit file_entry(fs::directory_entry const & entry)
: path_{entry.path()}, size_{fs::file_size(entry)}
{}
auto size() const { return size_; }
auto const& path() const { return path_; }
auto get_hash() {
if (!hash_)
hash_ = compute_hash();
return *hash_;
}
private:
xxh::hash64_t compute_hash() {
bi::mapped_file_source source;
source.open<fs::wpath>(this->path());
if (!source.is_open()) {
std::cerr << "Cannot open " << path() << std::endl;
throw std::runtime_error("Cannot open file");
}
xxh::hash_state64_t hash_stream;
hash_stream.update(source.data(), size_);
return hash_stream.digest();
}
private:
fs::wpath path_;
uintmax_t size_;
std::optional<xxh::hash64_t> hash_;
};
using vector_type = std::vector<file_entry>;
using iterator_type = vector_type::iterator;
auto find_files_in_dir(fs::wpath const& path, vector_type& file_vector, uintmax_t min_size = 1) {
size_t found = 0, ignored = 0;
if (!fs::is_directory(path)) {
std::cerr << path << " is not a directory!" << std::endl;
}
else {
std::cerr << "Searching " << path << std::endl;
for (auto& e : fs::recursive_directory_iterator(path)) {
++found;
if (fs::is_regular_file(e) && fs::file_size(e) >= min_size)
file_vector.emplace_back(e);
else ++ignored;
}
}
return std::make_tuple(found, ignored);
}
int main(int argn, char* argv[])
{
vector_type files;
for (auto i = 1; i < argn; ++i) {
fs::wpath path(argv[i]);
auto [found, ignored] = find_files_in_dir(path, files);
std::cerr << boost::format{
" %1$6d files found\n"
" %2$6d files ignored\n"
" %3$6d files added\n" } % found % ignored % (found - ignored)
<< std::endl;
}
std::cerr << "Found " << files.size() << " regular files" << std::endl;
// sort files in descending order by file size
std::sort(std::execution::par_unseq, files.begin(), files.end()
, [](auto const& a, auto const& b) { return a.size() > b.size(); }
);
for_each_adjacent_range(
std::begin(files)
, std::end(files)
, [](vector_type::value_type const& f) { return f.size(); }
, [](auto start, auto end, auto file_size) {
// Files with same size
size_t nr_of_files = std::distance(start, end);
if (nr_of_files > 1) {
// sort range start-end by hash
std::sort(start, end, [](auto& a, auto& b) {
auto const& ha = a.get_hash();
auto const& hb = b.get_hash();
auto const& pa = a.path();
auto const& pb = b.path();
return std::tie(ha, pa) < std::tie(hb, pb);
});
for_each_adjacent_range(
start
, end
, [](vector_type::value_type& f) { return f.get_hash(); }
, [file_size](auto hstart, auto hend, auto hash) {
// Files with same size and same hash are assumed to be identical
// could resort to compare files byte-by-byte now
size_t hnr_of_files = std::distance(hstart, hend);
if (hnr_of_files > 1) {
std::cout << boost::format{ "%1$3d files with hash %3$016x and size %2$d\n" }
% hnr_of_files % file_size % hash;
std::for_each(hstart, hend, [hash, file_size](auto& e) {
std::cout << '\t' << e.path() << '\n';
}
);
}
}
);
}
}
);
return 0;
}