Commit 94ad72ce authored by Sigmund Augdal's avatar Sigmund Augdal

Make a little command line util out of the netflow test app

parent c655f949
from pyspark.conf import SparkConf from pyspark.conf import SparkConf
from pyspark import SparkContext from pyspark import SparkContext
import argparse
DESCRIPTION = "Analyze netflow data"
conf = SparkConf() conf = SparkConf()
conf.setAppName("Netflow test").set("spark.executor.memory", "1g").set("spark.default.parallelism", 4) conf.setAppName("Netflow test").set("spark.executor.memory", "1g").set("spark.default.parallelism", 15).set("spark.mesos.coarse", "true")
sc = SparkContext(conf=conf) sc = SparkContext(conf=conf)
def add(x, y): def parse_args():
return x + y parser = argparse.ArgumentParser(description=DESCRIPTION)
parser.add_argument('--input', help="Data file to read")
parser.add_argument('--find-top-ports', action="store_true", help="Find top ports")
parser.add_argument('--find-top-ssh-clients', action="store_true", help="Find addresses of most active ssh clients")
#path = 'hdfs://daas/user/hdfs/trd_gw1_12_01_normalized.csv' return parser.parse_args()
path = 'hdfs://daas/user/hdfs/trd_gw1_12_normalized.csv/*'
csv = sc.textFile(path).map(lambda x: x.split(",")).cache()
def top_ips(csv, num=10): def add(x, y):
ips = csv.flatMap(lambda x: x[1:3]) return x + y
def top_ips(csv, which="both", num=10):
if which == "both":
ips = csv.flatMap(lambda x: x[1:3])
elif which == "client":
ips = csv.map(lambda x: x[1])
elif which == "server":
ips = csv.map(lambda x: x[2])
ip_count = ips.map(lambda x: (x, 1)).reduceByKey(add) ip_count = ips.map(lambda x: (x, 1)).reduceByKey(add)
return ip_count.map(lambda x: (x[1], x[0])).sortByKey(False).take(num) return ip_count.map(lambda x: (x[1], x[0])).sortByKey(False).take(num)
...@@ -26,12 +38,26 @@ def top_ports(csv, num=10): ...@@ -26,12 +38,26 @@ def top_ports(csv, num=10):
port_count = ports.map(lambda x: (x, 1)).reduceByKey(add) port_count = ports.map(lambda x: (x, 1)).reduceByKey(add)
return port_count.map(lambda x: (x[1], x[0])).sortByKey(False).take(num) return port_count.map(lambda x: (x[1], x[0])).sortByKey(False).take(num)
# print "Finding top ports"
# top = top_ports(csv)
# print "Port Count"
# for count, port in top:
# print port, count
print "Finding active ssh ips" opts = parse_args()
ssh_ips = csv.filter(lambda x: x[3] == '22')
print top_ips(ssh_ips, 15) csv = sc.textFile(opts.input).map(lambda x: x.split(","))
if opts.find_top_ports:
print "Finding top ports"
top = top_ports(csv)
print "\n\n"
print "Top ports:"
print "{:>6} {:>12}".format("Port", "Count")
for count, port in top:
print "{:>6} {:>12}".format(port, count)
if opts.find_top_ssh_clients:
print "Finding active ssh ips"
ssh_ips = csv.filter(lambda x: x[3] == '22')
top_ssh_clients = top_ips(ssh_ips, "client", 15)
print "\n\n"
print "Top addresses involved in ssh traffic"
print "{:>15} {:>9}".format("Address", "Count")
for count, address in top_ssh_clients:
print "{:>15} {:>9}".format(address, count)
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment