Word Count (Map Reduce)

Using map reduce to count word frequency.

https://hadoop.apache.org/docs/r1.2.1/mapred\_tutorial.html\#Example%3A+WordCount+v1.0

Example

chunk1: "Google Bye GoodBye Hadoop code"
chunk2: "lintcode code Bye"


Get MapReduce result:
    Bye: 2
    GoodBye: 1
    Google: 1
    Hadoop: 1
    code: 2
    lintcode: 1

/**
 * Definition of Input:
 * template<class T>
 * class Input {
 * public:
 *     bool done(); 
 *         // Returns true if the iteration has elements or false.
 *     void next();
 *         // Move to the next element in the iteration
 *         // Runtime error if the iteration has no more elements
 *     T value();
 *        // Get the current element, Runtime error if
 *        // the iteration has no more elements
 * }
 */
class WordCountMapper: public Mapper {
public:
    void Map(Input<string>* input) {
        // Write your code here
        // Please directly use func 'output' to 
        // output the results into output buffer.
        // void output(string &key, int value);
        while (!input->done()) {
            vector<string> words = split(input->value());
            for (string& word: words) {
                output(word, 1);
            }
            input->next();
        }
    }

private: 
    vector<string> split(string s) {
        istringstream iss(s);
        vector<string> result;
        while (iss) {
            string word;
            iss >> word;
            if (word.size() > 0) {
                result.push_back(word);
            }
        }
        return result;
    }
};


class WordCountReducer: public Reducer {
public:
    void Reduce(string &key, Input<int>* input) {
        // Write your code here
        // Please directly use func 'output' to 
        // output the results into output buffer.
        // void output(string &key, int value);
        int sum = 0;
        while (!input->done()) {
            sum++;
            input->next();
        }
        output(key, sum);
    }
};

Top K Frequent Words (Map Reduce)

Find top k frequent words with map reduce framework.

The mapper's key is the document id, value is the content of the document, words in a document are split by spaces.

For reducer, the output should be at most k key-value pairs, which are the top k words and their frequencies in this reducer. The judge will take care about how to merge different reducers' results to get the global top k frequent words, so you don't need to care about that part.

The_k_is given in the constructor of TopK class.

Notice

For the words with same frequency, rank them with alphabet.

Example

Given document A =

lintcode is the best online judge
I love lintcode

and document B =

lintcode is an online judge for coding interview
you can test your code online at lintcode

The top 2 words and their frequencies should be

lintcode, 4
online, 3

/**
 * Definition of Input:
 * template<class T>
 * class Input {
 * public:
 *     bool done(); 
 *         // Returns true if the iteration has elements or false.
 *     void next();
 *         // Move to the next element in the iteration
 *         // Runtime error if the iteration has no more elements
 *     T value();
 *        // Get the current element, Runtime error if
 *        // the iteration has no more elements
 * }
 * Definition of Document:
 * class Document {
 * public:
 *     int id; // document id
 *     string content; // document content
 * }
 */
class TopKFrequentWordsMapper: public Mapper {
public:
    void Map(Input<Document>* input) {
        // Write your code here
        // Please directly use func 'output' to output 
        // the results into output buffer.
        // void output(string &key, int value);
        while (!input->done()) {
            vector<string> words = split(input->value().content);
            for (string& word : words) {
                if (word.size() > 0) {
                    output(word, 1);
                }
            }
            input->next();
        }
    }

private: 
    vector<string> split(string s) {
        istringstream iss(s);
        vector<string> result;
        while (iss) {
            string word;
            iss >> word;
            if (word.size() > 0) {
                result.push_back(word);
            }
        }
        return result;
    }
};

class Pair {
public:
    string key;
    int value;
    Pair(string word, int times) {
        key = word;
        value = times;
    };

    bool operator<(const Pair& obj) const {
        return value > obj.value || value == obj.value && key < obj.key;
    }
};

class TopKFrequentWordsReducer: public Reducer {
private:
    int k;
    priority_queue<Pair> q;
public:
    void setUp(int k) {
        // initialize your data structure here
        this->k = k;
    }

    void Reduce(string &key, Input<int>* input) {
        // Write your code here
        int sum = 0;
        while (!input->done()) {
            sum += input->value();
            input->next();
        }

        Pair pair(key, sum);
        if (q.size() < k)
            q.push(pair);
        else {
            q.push(pair);
            q.pop();
        }        
    }

    void cleanUp() {
        // Please directly use func 'output' to output 
        // the top k pairs <word, times> into output buffer.
        // void output(string &key, int &value);
        vector<Pair> list;
        while (!q.empty()) {
            list.push_back(q.top());
            q.pop();
        }

        // reverse
        int n = list.size();
        for (int i = n - 1; i >=0; --i)
            output(list[i].key, list[i].value);
    }
};

Inverted Index (Map Reduce)

Use map reduce to build inverted index for given documents.

/**
 * Definition of Input:
 * template<class T>
 * class Input {
 * public:
 *     bool done(); 
 *         // Returns true if the iteration has elements or false.
 *     void next();
 *         // Move to the next element in the iteration
 *         // Runtime error if the iteration has no more elements
 *     T value();
 *        // Get the current element, Runtime error if
 *        // the iteration has no more elements
 * }
 * Definition of Document:
 * class Document {
 * public:
 *     int id; // document id
 *     string content; // document content
 * }
 */
class InvertedIndexMapper: public Mapper {
public:
    void Map(Input<Document>* input) {
        while (!input->done()) {
            Document doc = input->value();
            vector<string> keys = split(doc.content);
            for (string s : keys) {
                output(s, doc.id);
            }
            input->next();
        }
    }
private: 
    vector<string> split(string s) {
        istringstream iss(s);
        vector<string> result;
        while (iss) {
            string word;
            iss >> word;
            if (word.size() > 0) {
                result.push_back(word);
            }
        }
        return result;
    }
};


class InvertedIndexReducer: public Reducer {
public:
    void Reduce(string &key, Input<int>* input) {
        int last = -1;
        vector<int> result;
        while (!input->done()) {
            int id = input->value();
            if (id != last) {
                last = id;
                result.push_back(id);
            }
            input->next();
        }
        output(key, result);
    }
};

Anagram (Map Reduce)

Use Map Reduce to find anagrams in a given list of words.

Example

Given["lint", "intl", "inlt", "code"], return["lint", "inlt", "intl"],["code"].

Given["ab", "ba", "cd", "dc", "e"], return["ab", "ba"], ["cd", "dc"], ["e"].

/**
 * Definition of Input:
 * template<class T>
 * class Input {
 * public:
 *     bool done(); 
 *         // Returns true if the iteration has elements or false.
 *     void next();
 *         // Move to the next element in the iteration
 *         // Runtime error if the iteration has no more elements
 *     T value();
 *        // Get the current element, Runtime error if
 *        // the iteration has no more elements
 * }
 */
class AnagramMapper: public Mapper {
public:
    void Map(Input<string>* input) {
        while (!input->done()) {
            vector<string> words = split(input->value());
            for (string word : words) {
                string key = word;
                sort(key.begin(), key.end());
                output(key, word);
            }
            input->next();
        }
    }

private:
    vector<string> split(string s) {
        istringstream iss(s);
        vector<string> result;
        while (iss) {
            string word;
            iss >> word;
            if (word.size() > 0) {
                result.push_back(word);
            }
        }
        return result;
    }
};


class AnagramReducer: public Reducer {
public:
    void Reduce(string &key, Input<string>* input) {
        vector<string> result;
        while (!input->done()) {
            result.push_back(input->value());
            input->next();
        }
        output(key, result);
    }
};

Map reduce

Word Count (Map Reduce)

Top K Frequent Words (Map Reduce)

Notice

Inverted Index (Map Reduce)

Anagram (Map Reduce)

results matching ""

No results matching ""