Commit 3e8434ea authored by Sigmund Augdal

Replace two maps and a union with a single flatMap. This seems to be faster.

parent 0ac1f5ac
......@@ -31,7 +31,9 @@ def ports_count_by_ip3(csv):
def ports_count_by_ip(csv):
    """Return the 20 most frequent (destination port, IP) pairs.

    Every row contributes two keys, ``(row[DEST_PORT], row[DEST_IP])`` and
    ``(row[DEST_PORT], row[SRC_IP])``, each with weight 1.  Weights for
    equal keys are combined with ``add`` (presumably ``operator.add`` --
    confirm against the file's imports).

    Args:
        csv: Collection of rows exposing ``flatMap``; each row must be
            indexable by ``DEST_PORT``, ``DEST_IP`` and ``SRC_IP``.
            Presumably a Spark RDD of parsed CSV records -- confirm
            against the caller.

    Returns:
        A list of at most 20 ``(count, (port, ip))`` tuples, ordered by
        ``count`` from highest to lowest.
    """
    def port_ip_pairs(row):
        # Emit both the destination-IP and the source-IP key for this row.
        port = row[DEST_PORT]
        return (((port, row[DEST_IP]), 1), ((port, row[SRC_IP]), 1))

    # One flatMap pass replaces the earlier two map() passes joined with
    # union(); the single pass seemed to be faster.
    counts = csv.flatMap(port_ip_pairs).reduceByKey(add)

    # Swap to (count, key) so that sortByKey orders by count; passing False
    # requests descending order.
    return counts.map(lambda kv: (kv[1], kv[0])).sortByKey(False).take(20)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment