#include #include #include // for shared_ptr<> #include #include #include #include // for lower_bound() #include // for next() and prev() using namespace std; class LCS { protected: // This linked list class is used to trace the LCS candidates class Pair { public: uint32_t index1; uint32_t index2; shared_ptr next; Pair(uint32_t index1, uint32_t index2, shared_ptr next = nullptr) : index1(index1), index2(index2), next(next) { } static shared_ptr Reverse(const shared_ptr pairs) { shared_ptr head = nullptr; for (auto next = pairs; next != nullptr; next = next->next) head = make_shared(next->index1, next->index2, head); return head; } }; typedef deque> PAIRS; typedef deque THRESHOLD; typedef deque INDEXES; typedef map CHAR2INDEXES; typedef deque MATCHES; // return the LCS as a linked list of matched index pairs uint32_t Pairs(MATCHES& matches, shared_ptr *pairs) { auto trace = pairs != nullptr; PAIRS traces; THRESHOLD threshold; // //[Assert]After each index1 iteration threshold[index3] is the least index2 // such that the LCS of s1[0:index1] and s2[0:index2] has length index3 + 1 // uint32_t index1 = 0; for (const auto& it1 : matches) { if (!it1->empty()) { auto dq2 = *it1; auto limit = threshold.end(); for (auto it2 = dq2.rbegin(); it2 != dq2.rend(); it2++) { // Each of the index1, index2 pairs considered here correspond to a match auto index2 = *it2; // // Note: The reverse iterator it2 visits index2 values in descending order, // allowing thresholds to be updated in-place. std::lower_bound() is used // to perform a binary search. // limit = lower_bound(threshold.begin(), limit, index2); auto index3 = distance(threshold.begin(), limit); // // Look ahead to the next index2 value to optimize space used in the Hunt // and Szymanski algorithm. If the next index2 is also an improvement on // the value currently held in threshold[index3], a new Pair will only be // superseded on the next index2 iteration. // // Depending on match redundancy, the number of Pair constructions may be // divided by factors ranging from 2 up to 10 or more. // auto skip = next(it2) != dq2.rend() && (limit == threshold.begin() || *prev(limit) < *next(it2)); if (skip) continue; if (limit == threshold.end()) { // insert case threshold.push_back(index2); // Refresh limit iterator: limit = prev(threshold.end()); if (trace) { auto prefix = index3 > 0 ? traces[index3 - 1] : nullptr; auto last = make_shared(index1, index2, prefix); traces.push_back(last); } } else if (index2 < *limit) { // replacement case *limit = index2; if (trace) { auto prefix = index3 > 0 ? traces[index3 - 1] : nullptr; auto last = make_shared(index1, index2, prefix); traces[index3] = last; } } } // next index2 } index1++; } // next index1 if (trace) { auto last = traces.size() > 0 ? traces.back() : nullptr; // Reverse longest back-trace *pairs = Pair::Reverse(last); } auto length = threshold.size(); return length; } // // Match() avoids incurring m*n comparisons by using the associative // memory implemented by CHAR2INDEXES to achieve O(m+n) performance, // where m and n are the input lengths. // // The lookup time can be assumed constant in the case of characters. // The symbol space is larger in the case of records; but the lookup // time will be O(log(m+n)), at most. // void Match(CHAR2INDEXES& indexes, MATCHES& matches, const string& s1, const string& s2) { uint32_t index = 0; for (const auto& it : s2) indexes[it].push_back(index++); for (const auto& it : s1) { auto& dq2 = indexes[it]; matches.push_back(&dq2); } } string Select(shared_ptr pairs, uint32_t length, bool right, const string& s1, const string& s2) { string buffer; buffer.reserve(length); for (auto next = pairs; next != nullptr; next = next->next) { auto c = right ? s2[next->index2] : s1[next->index1]; buffer.push_back(c); } return buffer; } public: string Correspondence(const string& s1, const string& s2) { CHAR2INDEXES indexes; MATCHES matches; // holds references into indexes Match(indexes, matches, s1, s2); shared_ptr pairs; // obtain the LCS as index pairs auto length = Pairs(matches, &pairs); return Select(pairs, length, false, s1, s2); } };