Skip to content

Commit

Permalink
更新15.mapreduce的使用注解
Browse files Browse the repository at this point in the history
  • Loading branch information
jiangzhonglian committed Apr 8, 2017
1 parent 87836f0 commit 6bdfd5a
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 12 deletions.
18 changes: 11 additions & 7 deletions src/python/15.BigData_MapReduce/mrMean.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,34 @@
#!/usr/bin/python
# coding:utf8

'''
Created on 2017-04-07
@author: Peter/ApacheCN-xy
'''
from mrjob.job import MRJob


class MRmean(MRJob):
def __init__(self, *args, **kwargs): # 对数据初始化
super(MRmean, self).__init__(*args, **kwargs)
self.inCount = 0
self.inSum = 0
self.inSqSum = 0

def map(self, key, val): # 需要 2 个参数,求数据的和与平方和
if False: yield
inVal = float(val)
self.inCount += 1
self.inSum += inVal
self.inSqSum += inVal*inVal

def map_final(self): # 计算数据的平均值,平方的均值,并返回
mn = self.inSum/self.inCount
mnSq = self.inSqSum/self.inCount
yield (1, [self.inCount, mn, mnSq])

def reduce(self, key, packedValues): #
def reduce(self, key, packedValues):
cumVal=0.0; cumSumSq=0.0; cumN=0.0
for valArr in packedValues: # 从输入流中获取值
nj = float(valArr[0])
Expand All @@ -34,10 +38,10 @@ def reduce(self, key, packedValues): #
mean = cumVal/cumN
var = (cumSumSq - 2*mean*cumVal + cumN*mean*mean)/cumN
yield (mean, var) # 发出平均值和方差

def steps(self):
return ([self.mr(mapper=self.map, mapper_final=self.map_final,\
reducer=self.reduce,)])
return ([self.mr(mapper=self.map, mapper_final=self.map_final, reducer=self.reduce,)])


if __name__ == '__main__':
MRmean.run()
MRmean.run()
4 changes: 3 additions & 1 deletion src/python/15.BigData_MapReduce/mrMeanMapper.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#!/usr/bin/python
# coding:utf8
'''
Created on 2017-04-06
Machine Learning in Action Chapter 18
Expand Down Expand Up @@ -30,4 +32,4 @@ def read_input(file):

# 输出 数据的个数,n个数据的均值,n个数据平方之后的均值
print ("%d\t%f\t%f" % (numInputs, mean(input), mean(sqInput))) #计算均值
print ("report: still alive", file=sys.stderr)
print >> sys.stderr, "report: still alive"
11 changes: 7 additions & 4 deletions src/python/15.BigData_MapReduce/mrMeanReducer.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
#!/usr/bin/python
# coding:utf8

'''
Created on 2017-04-06
Machine Learning in Action Chapter 18
Expand All @@ -7,9 +10,9 @@


'''
mapper 接受原始的输入并产生中间值传递给 reducer。
很多的mapper是并行执行的,所以需要将这些mapper的输出合并成一个值。
即:将中间的 key/value 对进行组合。
mapper 接受原始的输入并产生中间值传递给 reducer。
很多的mapper是并行执行的,所以需要将这些mapper的输出合并成一个值。
即:将中间的 key/value 对进行组合。
'''
import sys
from numpy import mat, mean, power
Expand Down Expand Up @@ -40,4 +43,4 @@ def read_input(file):

#输出 数据总量,均值,平方的均值(方差)
print ("%d\t%f\t%f" % (cumN, mean, meanSq))
print ("report: still alive", file=sys.stderr)
print >> sys.stderr, "report: still alive"

0 comments on commit 6bdfd5a

Please sign in to comment.