Commit 94ad72ce authored by Sigmund Augdal's avatar Sigmund Augdal

Make a little command line util out of the netflow test app

parent c655f949
from pyspark.conf import SparkConf
from pyspark import SparkContext
import argparse
DESCRIPTION = "Analyze netflow data"
conf = SparkConf()
conf.setAppName("Netflow test").set("spark.executor.memory", "1g").set("spark.default.parallelism", 4)
conf.setAppName("Netflow test").set("spark.executor.memory", "1g").set("spark.default.parallelism", 15).set("spark.mesos.coarse", "true")
sc = SparkContext(conf=conf)
def add(x, y):
return x + y
def parse_args():
parser = argparse.ArgumentParser(description=DESCRIPTION)
parser.add_argument('--input', help="Data file to read")
parser.add_argument('--find-top-ports', action="store_true", help="Find top ports")
parser.add_argument('--find-top-ssh-clients', action="store_true", help="Find addresses of most active ssh clients")
#path = 'hdfs://daas/user/hdfs/trd_gw1_12_01_normalized.csv'
path = 'hdfs://daas/user/hdfs/trd_gw1_12_normalized.csv/*'
csv = sc.textFile(path).map(lambda x: x.split(",")).cache()
return parser.parse_args()
def top_ips(csv, num=10):
ips = csv.flatMap(lambda x: x[1:3])
def add(x, y):
return x + y
def top_ips(csv, which="both", num=10):
if which == "both":
ips = csv.flatMap(lambda x: x[1:3])
elif which == "client":
ips = csv.map(lambda x: x[1])
elif which == "server":
ips = csv.map(lambda x: x[2])
ip_count = ips.map(lambda x: (x, 1)).reduceByKey(add)
return ip_count.map(lambda x: (x[1], x[0])).sortByKey(False).take(num)
......@@ -26,12 +38,26 @@ def top_ports(csv, num=10):
port_count = ports.map(lambda x: (x, 1)).reduceByKey(add)
return port_count.map(lambda x: (x[1], x[0])).sortByKey(False).take(num)
# print "Finding top ports"
# top = top_ports(csv)
# print "Port Count"
# for count, port in top:
# print port, count
print "Finding active ssh ips"
ssh_ips = csv.filter(lambda x: x[3] == '22')
print top_ips(ssh_ips, 15)
opts = parse_args()
csv = sc.textFile(opts.input).map(lambda x: x.split(","))
if opts.find_top_ports:
print "Finding top ports"
top = top_ports(csv)
print "\n\n"
print "Top ports:"
print "{:>6} {:>12}".format("Port", "Count")
for count, port in top:
print "{:>6} {:>12}".format(port, count)
if opts.find_top_ssh_clients:
print "Finding active ssh ips"
ssh_ips = csv.filter(lambda x: x[3] == '22')
top_ssh_clients = top_ips(ssh_ips, "client", 15)
print "\n\n"
print "Top addresses involved in ssh traffic"
print "{:>15} {:>9}".format("Address", "Count")
for count, address in top_ssh_clients:
print "{:>15} {:>9}".format(address, count)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment