All DNA is composed of a series of nucleotides abbreviated as A, C, G, and T, for example: "ACGAATTCCG". When studying DNA, it is sometimes useful to identify repeated sequences within the DNA.
Write a function to find all the 10-letter-long sequences (substrings) that occur more than once in a DNA molecule.
Copy Input: s = "AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT"
Output: ["AAAAACCCCC", "CCCCCAAAAA"]
Copy // Hashmap
vector<string> findRepeatedDnaSequences(string s) { // time: O(n); space: O(n)
unordered_map<string, int> mp;
vector<string> res;
string tmp;
for (int i = 0; i + 9 < s.length(); ++i) {
tmp = s.substr(i, 10);
if (mp[tmp]++ == 1) res.emplace_back(tmp);
}
return res;
}
Copy // Hashset
vector<string> findRepeatedDnaSequences(string s) { // time: O(n); space: O(n)
unordered_set<string> seen, repeated;
vector<string> res;
string tmp;
for (int i = 0; i + 9 < s.length(); ++i) {
tmp = s.substr(i, 10);
if (seen.count(tmp)) repeated.insert(tmp);
else seen.insert(tmp);
}
for (const string str : repeated) res.emplace_back(str);
return res;
}
Copy // Bit Manipulation
vector<string> findRepeatedDnaSequences(string s) { // time: O(n); space: O(n)
vector<string> res;
unordered_set<int> seen, repeated;
vector<int> char_map(26);
// char_map['A' - 'A'] = 0; // 00
char_map['C' - 'A'] = 1; // 01
char_map['G' - 'A'] = 2; // 10
char_map['T' - 'A'] = 3; // 11
int hash_val = 0;
for (int i = 0; i < s.length(); ++i) {
hash_val <<= 2;
hash_val |= char_map[s[i] - 'A'];
hash_val &= 0xfffff; // f: 1111
if (i < 9) continue;
if (seen.count(hash_val)) {
if (!repeated.count(hash_val)) {
res.emplace_back(s.substr(i - 9, 10));
repeated.insert(hash_val);
}
} else {
seen.insert(hash_val);
}
}
return res;
}