I am trying to write a custom reporting command that finds the top words. It seems to work, but I see that some data isn't transferred from the mapper to the reducer. For example, if I process 10 events and produce 100 words on each mapper invocation, the reducer should receive 100 × (number of mapper invocations) words to process, but that doesn't happen: some of the words yielded by the mapper cannot be accessed by the reducer.
My mapper and reducer implementation is below.
@Configuration()
def map(self, records):
    """Count the words found in the ``self.field`` value of each record.

    Each record's field value is split on whitespace and the per-word
    totals for this invocation are accumulated in a dict.  Once every
    record has been consumed, one ``{'word': ..., 'count': ...}`` dict is
    yielded per distinct word and a summary line is logged.

    :param records: iterable of event records; assumed to be dict-like
        (must support ``.get``) -- confirm against the calling framework.
    :return: generator of ``{'word': <word>, 'count': <int>}`` dicts.
    """
    self.logger.debug('TopWordsCommand.map')
    fieldname = self.field
    total = {}
    cnt = 0
    word_cnt = 0
    for record in records:
        cnt += 1
        # A record that lacks the field (or holds an empty/None value) has
        # no words to contribute; skip it instead of raising.
        text = record.get(fieldname)
        if not text:
            continue
        for word in text.split():
            total[word] = total.get(word, 0) + 1
            word_cnt += 1
    # items() works on both Python 2 and 3; iteritems() is Python 2 only.
    for word, count in total.items():
        yield {'word': word, 'count': count}
    # Runs only after the consumer has exhausted the generator.
    self.logger.info('Finished map. Processed {} events and {} words.'.format(cnt, word_cnt))
def reduce(self, records):
    """Merge per-word partial counts into one total per distinct word.

    Each incoming record is read through its ``'word'`` and ``'count'``
    keys.  ``count`` is coerced with ``int()``, so numeric strings are
    accepted as well as integers.  Once every record has been consumed,
    one ``{'word': ..., 'count': ...}`` dict is yielded per distinct word
    and a summary line is logged.

    :param records: iterable of records with ``'word'`` and ``'count'`` keys.
    :return: generator of ``{'word': <word>, 'count': <int>}`` dicts.
    :raises KeyError: if a record lacks ``'word'`` or ``'count'``.
    :raises ValueError: if a ``count`` value cannot be converted to ``int``.
    """
    self.logger.debug('TopWordsCommand.reduce')
    total = {}
    word_cnt = 0
    for record in records:
        word = record['word']
        total[word] = total.get(word, 0) + int(record['count'])
        word_cnt += 1
    # Every key of ``total`` is one distinct word.
    uniq_word_cnt = len(total)
    # items() works on both Python 2 and 3; iteritems() is Python 2 only.
    for word, count in total.items():
        yield {'word': word, 'count': count}
    # Runs only after the consumer has exhausted the generator.
    self.logger.info("Finished reduce. Total number of words {}, unique words {}".format(word_cnt, uniq_word_cnt))
↧