-from pyspark import SparkContext
+import sys
+sys.path.insert(0, '.')
+from pyspark import SparkContext, SparkConf
+from pairRdd.aggregation.reducebykey.housePrice.AvgCount import AvgCount
 
 if __name__ == "__main__":
-
-    sc = SparkContext("local", "avgHousePrice")
-    sc.setLogLevel("ERROR")
+    conf = SparkConf().setAppName("avgHousePrice").setMaster("local[3]")
+    sc = SparkContext(conf = conf)
 
     lines = sc.textFile("in/RealEstate.csv")
     cleanedLines = lines.filter(lambda line: "Bedrooms" not in line)
 
     housePricePairRdd = cleanedLines.map(lambda line: \
-        (line.split(",")[3], (1, float(line.split(",")[2]))))
+        (line.split(",")[3], AvgCount(1, float(line.split(",")[2]))))
 
     housePriceTotal = housePricePairRdd \
-        .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))
+        .reduceByKey(lambda x, y: AvgCount(x.count + y.count, x.total + y.total))
 
     print("housePriceTotal: ")
-    for bedroom, total in housePriceTotal.collect():
-        print("{} : {}".format(bedroom, total))
+    for bedroom, avgCount in housePriceTotal.collect():
+        print("{} : ({}, {})".format(bedroom, avgCount.count, avgCount.total))
 
-    housePriceAvg = housePriceTotal.mapValues(lambda avgCount: avgCount[1] / avgCount[0])
+    housePriceAvg = housePriceTotal.mapValues(lambda avgCount: avgCount.total / avgCount.count)
     print("\nhousePriceAvg: ")
     for bedroom, avg in housePriceAvg.collect():
         print("{} : {}".format(bedroom, avg))
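Note: the AvgCount class imported above is not part of this diff. A minimal sketch of what it presumably contains, inferred only from how the driver uses it (a constructor taking a count and a total, exposing them as attributes):

# pairRdd/aggregation/reducebykey/housePrice/AvgCount.py
# Sketch inferred from usage: a plain value holder pairing the number
# of records seen so far with their running price total.
class AvgCount():
    def __init__(self, count, total):
        self.count = count
        self.total = total

Summing counts and totals when merging two AvgCount values keeps the reduceByKey function associative and commutative, which Spark requires for partial aggregation on each partition; the average itself is only computed afterwards via mapValues, since averages cannot be merged directly.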