187. Repeated DNA Sequences

All DNA is composed of a series of nucleotides abbreviated as A, C, G, and T, for example: "ACGAATTCCG". When studying DNA, it is sometimes useful to identify repeated sequences within the DNA.

Write a function to find all the 10-letter-long sequences (substrings) that occur more than once in a DNA molecule.

Example:

Input: s = "AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT"

Output: ["AAAAACCCCC", "CCCCCAAAAA"]
// Hashmap
vector<string> findRepeatedDnaSequences(string s) { // time: O(n); space: O(n)
    unordered_map<string, int> mp;
    vector<string> res;
    string tmp;
    for (int i = 0; i + 9 < s.length(); ++i) {
        tmp = s.substr(i, 10);
        if (mp[tmp]++ == 1) res.emplace_back(tmp);
    }
    return res;
}
// Hashset
vector<string> findRepeatedDnaSequences(string s) { // time: O(n); space: O(n)
    unordered_set<string> seen, repeated;
    vector<string> res;
    string tmp;
    for (int i = 0; i + 9 < s.length(); ++i) {
        tmp = s.substr(i, 10);
        if (seen.count(tmp)) repeated.insert(tmp);
        else seen.insert(tmp);
    }
    for (const string str : repeated) res.emplace_back(str);
    return res;
}
// Bit Manipulation
vector<string> findRepeatedDnaSequences(string s) { // time: O(n); space: O(n)
    vector<string> res;
    unordered_set<int> seen, repeated;
    vector<int> char_map(26);
    // char_map['A' - 'A'] = 0; // 00
    char_map['C' - 'A'] = 1; // 01
    char_map['G' - 'A'] = 2; // 10
    char_map['T' - 'A'] = 3; // 11
    int hash_val = 0;
    for (int i = 0; i < s.length(); ++i) {
        hash_val <<= 2;
        hash_val |= char_map[s[i] - 'A'];
        hash_val &= 0xfffff; // f: 1111
        if (i < 9) continue;
        if (seen.count(hash_val)) {
            if (!repeated.count(hash_val)) {
                res.emplace_back(s.substr(i - 9, 10));
                repeated.insert(hash_val);
            }
        } else {
            seen.insert(hash_val);
        }
    }
    return res;
}

Last updated

Was this helpful?