All DNA is composed of a series of nucleotides abbreviated as A, C, G, and T, for example: “ACGAATTCCG”. When studying DNA, it is sometimes useful to identify repeated sequences within the DNA.
Write a function to find all the 10-letter-long sequences (substrings) that occur more than once in a DNA molecule.
For example, Given s = "AAAAACCCCCAAAAACCCCCAAAAAGGGTTT", Return: ["AAAAACCCCC", "CCCCCAAAAA"].
Solution:
Since there are only four possibilities for each char, we could use 2 bits to represent one char and assign them value like: 00 for ‘A’, 01 for ‘C’, 10 for ‘G’ and 11 for ‘T’.
Thus each 10-char substring could be transformed into 20 bits. We use a 32-bit int to represent the 10-char sequence. After we have the first 10 chars and move forward, each step we add a new char and remove the left-most char in the window. We could use some bit operations to represent this update.
Whenever we calculated an int value, we insert it into a map, which counts the occurrence of each int.
Finally we find out all the int value that appears more than once and transform them back to 10-char string.
My code:
class Solution { public: vector<string> findRepeatedDnaSequences(string s) { vector<string> out; if (s.size() < 10) return out; unordered_map<int, int> map; string temps = s.substr(0, 10); int temphash = stringHash(temps); map[temphash]++; for (int i = 10; i < s.size(); i++) { int newhash = updateHash(s, temphash, i); //string temps = s.substr(i - 9, 10); //int newhash = stringHash(temps); map[newhash]++; temphash = newhash; } for (auto it = map.begin(); it != map.end(); it++) { if (it -> second > 1) { string outstring = hashToStr(it -> first); out.push_back(outstring); } } return out; } int stringHash(string &s) { // s.size() == 10; int out = 0; for (int i = 0; i < 10; i++) { char c = s[i]; if (c == 'C') { out |= 1; } else if (c == 'G') { out |= 2; } else if (c == 'T') { out |= 3; } out <<= 2; } out >>= 2; return out; } int updateHash(string &s, int oldhash, int newi) { int mask = ~(3 << 18); oldhash &= mask; oldhash <<= 2; char newc = s[newi]; if (newc == 'C') oldhash |= 1; else if (newc == 'G') oldhash |= 2; else if (newc == 'T') oldhash |= 3; return oldhash; } string hashToStr(int hash) { string out = ""; int mask = 3; for (int i = 0; i < 10; i++) { int twodigit = hash & mask; if (twodigit == 0) { out = 'A' + out; } else if (twodigit == 1) { out = 'C' + out; } else if (twodigit == 2) { out = 'G' + out; } else { out = 'T' + out; } hash >>= 2; } return out; } };