25. +∞ B 1 +∞
E
5 1
0 +∞ 4 +∞
A D G
3
3 2
4
+∞ C 5 F +∞
1
26. +∞ B 1 +∞
E
5 1
0 +∞ 4 +∞
A D G
3
3 2
4
+∞ C 5 F +∞
1
27. +∞ B 1 +∞
E
5 1
0 +∞ 4 +∞
A D G
3
3 2
4
+∞ C 5 F +∞
1
28. 5 1 +∞
B E
5 1
0 3 4 +∞
A D G
3
3 2
4
+∞ C 5 F +∞
1
29. 5 1 +∞
B E
5 1
0 3 4 +∞
A D G
3
3 2
4
+∞ C 5 F +∞
2
30. 5 1 +∞
B E
5 1
0 3 4 +∞
A D G
3
3 2
4
+∞ C 5 F +∞
2
31. 4 1 6
B E
5 1
0 3 4 +∞
A D G
3
3 2
4
6 C 5 F 5
2
32. 4 1 6
B E
5 1
0 3 4 +∞
A D G
3
3 2
4
6 C 5 F 5
3
33. 4 1 6
B E
5 1
0 3 4 +∞
A D G
3
3 2
4
6 C 5 F 5
3
34. 4 1 5
B E
5 1
0 3 4 9
A D G
3
3 2
4
6 C 5 F 5
3
35. 4 1 5
B E
5 1
0 3 4 9
A D G
3
3 2
4
6 C 5 F 5
end
36. class ShortestPathMapper(Mapper)
def map(self, node_id, Node):
# send graph structure
emit node_id, Node
# get node value and add it to edge distance
dist = Node.get_value()
for neighbour_node_id in Node.get_adjacency_list():
dist_to_nbr = Node.get_distance(
node_id, neighbour_node_id )
emit neighbour_node_id, dist + dist_to_nbr
37. class ShortestPathReducer(Reducer):
def reduce(self, node_id, dist_list):
min_dist = sys.maxint
for dist in dist_list:
# dist_list contains a Node
if is_node(dist):
Node = dist
elif dist < min_dist:
min_dist = dist
Node.set_value(min_dist)
" emit node_id, Node
42. # In-Mapper Combiner
class ShortestPathMapper(Mapper):
    """In-mapper combining: buffer the minimum distance per neighbour and
    emit in batches instead of emitting every candidate immediately."""
    def __init__(self):
        # key -> smallest candidate distance seen so far for that key
        self.buffer = {}
    def check_and_put(self, key, value):
        # Keep only the minimum value per key (local combine).
        if key not in self.buffer or value < self.buffer[key]:
            self.buffer[key] = value
    def check_and_emit(self):
        # Flush the buffer only when it grows past a size limit.
        # NOTE(review): is_exceed_limit_buffer_size is defined elsewhere -- confirm.
        if is_exceed_limit_buffer_size(self.buffer):
            for key, value in self.buffer.items():
                emit key, value
            self.buffer = {}
    def close(self):
        # End of the map task: flush whatever is still buffered.
        for key, value in self.buffer.items():
            emit key, value
43. #...continue
def map(self, node_id, Node):
# send graph structure
emit node_id, Node
# get node value and add it to edge distance
dist = Node.get_value()
for nbr_node_id in Node.get_adjacency_list():
dist_to_nbr = Node.get_distance(node_id, nbr_node_id)
dist_nbr = dist + dist_to_nbr
check_and_put(nbr_node_id, dist_nbr)
check_and_emit()
48. # Shimmy trick
class ShortestPathReducer(Reducer):
    """Shimmy trick: merge-join the reduce input with a sorted on-disk
    graph partition so node records need not travel through the shuffle."""
    def __init__(self):
        # NOTE(review): P appears to be a handle to this reducer's graph
        # partition, defined outside this snippet -- confirm.
        P.open_graph_partition()
    def emit_precede_node(self, node_id):
        # Stream nodes from the partition in sort order: every node that
        # precedes node_id is emitted unchanged (it received no messages);
        # the matching node is returned to the caller for updating.
        for pre_node_id, Node in P.read():
            if node_id == pre_node_id:
                return Node
            else:
                emit pre_node_id, Node
49. #(...continue)
def reduce(node_id, dist_list):
Node = self.emit_precede_node(node_id)
min_dist = sys.maxint
for dist in dist_list:
if dist < min_dist:
min_dist = dist
Node.set_value(min_dist)
emit node_id, Node
51. +∞ B 1 +∞
E
5 1
0 +∞ 4 +∞
A D G
3
3 2
4
+∞ C 5 F +∞
1
52. +∞ B 1 +∞
E
5 1
0 +∞ 4 +∞
A D G
3
3 2
4
+∞ C 5 F +∞
1
53. +∞ B 1 +∞
E
5 1
0 +∞ 4 +∞
A D G
3
3 2
4
+∞ C 5 F +∞
1
54. 5 1 +∞
B E
5 1
0 3 4 +∞
A D G
3
3 2
4
+∞ C 5 F +∞
2
55. 5 1 +∞
B E
5 1
0 3 4 +∞
A D G
3
3 2
4
+∞ C 5 F +∞
2
56. 4 1 6
B E
5 1
0 3 4 +∞
A D G
3
3 2
4
6 C 5 F 5
3
57. 4 1 6
B E
5 1
0 3 4 +∞
A D G
3
3 2
4
6 C 5 F 5
3
58. 4 1 5
B E
5 1
0 3 4 9
A D G
3
3 2
4
6 C 5 F 5
4
59. 4 1 5
B E
5 1
0 3 4 9
A D G
3
3 2
4
6 C 5 F 5
4
60. 4 1 5
B E
5 1
0 3 4 9
A D G
3
3 2
4
6 C 5 F 5
5
61. 4 1 5
B E
5 1
0 3 4 9
A D G
3
3 2
4
6 C 5 F 5
5
62. 4 1 5
B E
5 1
0 3 4 9
A D G
3
3 2
4
6 C 5 F 5
end
# Slide 63: Pregel/BSP-style vertex program for single-source shortest path.
class ShortestPathVertex:
    def compute(self, msgs):
        """One superstep: relax the distance from incoming messages and,
        if it improved, propagate the new distance along outgoing edges.

        msgs -- messages received this superstep; each carries a candidate
        distance via get_value().
        """
        # Source starts at 0; everyone else starts at "infinity".
        min_dist = 0 if self.is_source() else sys.maxsize  # BUGFIX: Py2 sys.maxint
        # Take the minimum over all incoming candidate distances.
        for msg in msgs:
            min_dist = min(min_dist, msg.get_value())
        if min_dist < self.get_value():
            # Update current value (state).
            # BUGFIX: removed stray '"' that made this line an unterminated string.
            self.set_current_value(min_dist)
            # Send the new value along every outgoing edge.
            # BUGFIX: joined the broken `recipient =` line continuation.
            for out_edge in self.get_out_edge_iterator():
                recipient = out_edge.get_other_element(self.get_id())
                # BUGFIX: send_massage -> send_message (typo).
                self.send_message(recipient.get_id(),
                                  min_dist + out_edge.get_distance())
        # Deactivate until a new message arrives.
        self.vote_to_halt()
72. Science and Technology), South Korea edwardyoon@apache.org Science and Technology), South Korea
swseo@calab.kaist.ac.kr jaehong@calab.kaist.ac.kr
Seongwook Jin — Computer Science Division, KAIST (Korea Advanced Institute of Science and Technology), South Korea — swjin@calab.kaist.ac.kr
Jin-Soo Kim — School of Information and Communication, Sungkyunkwan University, South Korea — jinsookim@skku.edu
Seungryoul Maeng — Computer Science Division, KAIST (Korea Advanced Institute of Science and Technology), South Korea — maeng@calab.kaist.ac.kr
Abstract—Various scientific computations have become so complex, and thus computation tools play an important role. In this paper, we explore the state-of-the-art framework providing high-level matrix computation primitives with MapReduce through the case study approach, and demonstrate these primitives with different computation engines to show the performance and scalability. We believe the opportunity for using MapReduce in scientific computation is even more promising than the success to date in the parallel systems literature.

[Fig. 1 layer labels: APPLICATION — HAMA API — HAMA Core / HAMA Shell — Computation Engine: MapReduce, BSP, Dryad (Plugged In/Out) — Zookeeper (Distributed Locking) — Storage Systems: HBase, HDFS, RDBMS, File]
I. INTRODUCTION

As cloud computing environment emerges, Google has introduced the MapReduce framework to accelerate parallel and distributed computing on more than a thousand of inexpensive machines. Google has shown that the MapReduce framework is easy to use and provides massive scalability with extensive fault tolerance [2]. Especially, MapReduce fits well with complex data-intensive computations such as high-dimensional scientific simulation, machine learning, and data mining. Google and Yahoo! are known to operate dedicated clusters for MapReduce applications, each cluster consisting of several thousands of nodes. One of typical MapReduce

Fig. 1. The overall architecture of HAMA. (http://wiki.apache.org/hama/Articles)

HAMA is a distributed framework on Hadoop for massive matrix and graph computations. HAMA aims at a powerful tool for various scientific applications, providing basic primitives for developers and researchers with simple APIs. HAMA is currently being incubated as one of the subprojects of Hadoop by the Apache Software Foundation [10]. Figure 1 illustrates the overall architecture of HAMA.