Word Count (Map Reduce)
Using map reduce to count word frequency.
https://hadoop.apache.org/docs/r1.2.1/mapred\_tutorial.html\#Example%3A+WordCount+v1.0
Example
chunk1: "Google Bye GoodBye Hadoop code"
chunk2: "lintcode code Bye"
Get MapReduce result:
Bye: 2
GoodBye: 1
Google: 1
Hadoop: 1
code: 2
lintcode: 1
/**
* Definition of Input:
* template<class T>
* class Input {
* public:
* bool done();
* // Returns true if the iteration has elements or false.
* void next();
* // Move to the next element in the iteration
* // Runtime error if the iteration has no more elements
* T value();
* // Get the current element, Runtime error if
* // the iteration has no more elements
* }
*/
class WordCountMapper: public Mapper {
public:
void Map(Input<string>* input) {
// Write your code here
// Please directly use func 'output' to
// output the results into output buffer.
// void output(string &key, int value);
while (!input->done()) {
vector<string> words = split(input->value());
for (string& word: words) {
output(word, 1);
}
input->next();
}
}
private:
vector<string> split(string s) {
istringstream iss(s);
vector<string> result;
while (iss) {
string word;
iss >> word;
if (word.size() > 0) {
result.push_back(word);
}
}
return result;
}
};
class WordCountReducer: public Reducer {
public:
void Reduce(string &key, Input<int>* input) {
// Write your code here
// Please directly use func 'output' to
// output the results into output buffer.
// void output(string &key, int value);
int sum = 0;
while (!input->done()) {
sum++;
input->next();
}
output(key, sum);
}
};
Top K Frequent Words (Map Reduce)
Find top k frequent words with map reduce framework.
The mapper's key is the document id, value is the content of the document, words in a document are split by spaces.
For reducer, the output should be at most k key-value pairs, which are the top k words and their frequencies in this reducer. The judge will take care about how to merge different reducers' results to get the global top k frequent words, so you don't need to care about that part.
The_k_is given in the constructor of TopK class.
Notice
For the words with same frequency, rank them with alphabet.
Example
Given document A =
lintcode is the best online judge
I love lintcode
and document B =
lintcode is an online judge for coding interview
you can test your code online at lintcode
The top 2 words and their frequencies should be
lintcode, 4
online, 3
/**
* Definition of Input:
* template<class T>
* class Input {
* public:
* bool done();
* // Returns true if the iteration has elements or false.
* void next();
* // Move to the next element in the iteration
* // Runtime error if the iteration has no more elements
* T value();
* // Get the current element, Runtime error if
* // the iteration has no more elements
* }
* Definition of Document:
* class Document {
* public:
* int id; // document id
* string content; // document content
* }
*/
class TopKFrequentWordsMapper: public Mapper {
public:
void Map(Input<Document>* input) {
// Write your code here
// Please directly use func 'output' to output
// the results into output buffer.
// void output(string &key, int value);
while (!input->done()) {
vector<string> words = split(input->value().content);
for (string& word : words) {
if (word.size() > 0) {
output(word, 1);
}
}
input->next();
}
}
private:
vector<string> split(string s) {
istringstream iss(s);
vector<string> result;
while (iss) {
string word;
iss >> word;
if (word.size() > 0) {
result.push_back(word);
}
}
return result;
}
};
class Pair {
public:
string key;
int value;
Pair(string word, int times) {
key = word;
value = times;
};
bool operator<(const Pair& obj) const {
return value > obj.value || value == obj.value && key < obj.key;
}
};
class TopKFrequentWordsReducer: public Reducer {
private:
int k;
priority_queue<Pair> q;
public:
void setUp(int k) {
// initialize your data structure here
this->k = k;
}
void Reduce(string &key, Input<int>* input) {
// Write your code here
int sum = 0;
while (!input->done()) {
sum += input->value();
input->next();
}
Pair pair(key, sum);
if (q.size() < k)
q.push(pair);
else {
q.push(pair);
q.pop();
}
}
void cleanUp() {
// Please directly use func 'output' to output
// the top k pairs <word, times> into output buffer.
// void output(string &key, int &value);
vector<Pair> list;
while (!q.empty()) {
list.push_back(q.top());
q.pop();
}
// reverse
int n = list.size();
for (int i = n - 1; i >=0; --i)
output(list[i].key, list[i].value);
}
};
Inverted Index (Map Reduce)
Use map reduce to build inverted index for given documents.
/**
* Definition of Input:
* template<class T>
* class Input {
* public:
* bool done();
* // Returns true if the iteration has elements or false.
* void next();
* // Move to the next element in the iteration
* // Runtime error if the iteration has no more elements
* T value();
* // Get the current element, Runtime error if
* // the iteration has no more elements
* }
* Definition of Document:
* class Document {
* public:
* int id; // document id
* string content; // document content
* }
*/
class InvertedIndexMapper: public Mapper {
public:
void Map(Input<Document>* input) {
while (!input->done()) {
Document doc = input->value();
vector<string> keys = split(doc.content);
for (string s : keys) {
output(s, doc.id);
}
input->next();
}
}
private:
vector<string> split(string s) {
istringstream iss(s);
vector<string> result;
while (iss) {
string word;
iss >> word;
if (word.size() > 0) {
result.push_back(word);
}
}
return result;
}
};
class InvertedIndexReducer: public Reducer {
public:
void Reduce(string &key, Input<int>* input) {
int last = -1;
vector<int> result;
while (!input->done()) {
int id = input->value();
if (id != last) {
last = id;
result.push_back(id);
}
input->next();
}
output(key, result);
}
};
Anagram (Map Reduce)
Use Map Reduce to find anagrams in a given list of words.
Example
Given["lint", "intl", "inlt", "code"]
, return["lint", "inlt", "intl"],["code"]
.
Given["ab", "ba", "cd", "dc", "e"]
, return["ab", "ba"], ["cd", "dc"], ["e"]
.
/**
* Definition of Input:
* template<class T>
* class Input {
* public:
* bool done();
* // Returns true if the iteration has elements or false.
* void next();
* // Move to the next element in the iteration
* // Runtime error if the iteration has no more elements
* T value();
* // Get the current element, Runtime error if
* // the iteration has no more elements
* }
*/
class AnagramMapper: public Mapper {
public:
void Map(Input<string>* input) {
while (!input->done()) {
vector<string> words = split(input->value());
for (string word : words) {
string key = word;
sort(key.begin(), key.end());
output(key, word);
}
input->next();
}
}
private:
vector<string> split(string s) {
istringstream iss(s);
vector<string> result;
while (iss) {
string word;
iss >> word;
if (word.size() > 0) {
result.push_back(word);
}
}
return result;
}
};
class AnagramReducer: public Reducer {
public:
void Reduce(string &key, Input<string>* input) {
vector<string> result;
while (!input->done()) {
result.push_back(input->value());
input->next();
}
output(key, result);
}
};