Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
Gurvinder Singh
spark_apps
Commits
94ad72ce
Commit
94ad72ce
authored
Jun 27, 2014
by
Sigmund Augdal
Browse files
Make a little command line util out of the netflow test app
parent
c655f949
Changes
1
Show whitespace changes
Inline
Side-by-side
pythonApp/netflowTest.py
View file @
94ad72ce
from
pyspark.conf
import
SparkConf
from
pyspark
import
SparkContext
import
argparse
# Text shown by argparse in the --help output.
DESCRIPTION = "Analyze netflow data"

# Spark job configuration. Each key is set exactly once; an earlier chain that
# set the same app name / executor memory and a parallelism of 4 was fully
# overridden by these values and has been dropped.
conf = SparkConf()
conf.setAppName("Netflow test")
conf.set("spark.executor.memory", "1g")
conf.set("spark.default.parallelism", 15)
conf.set("spark.mesos.coarse", "true")

# Module-level context shared by the rest of the script.
sc = SparkContext(conf=conf)
def add(x, y):
    """Return ``x + y``.

    Passed to ``reduceByKey`` elsewhere in this file to sum per-key counts.
    """
    return x + y
def parse_args(argv=None):
    """Parse the command-line options of the netflow analysis tool.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``.

    Returns:
        The ``argparse.Namespace`` with attributes ``input`` (str),
        ``find_top_ports`` (bool) and ``find_top_ssh_clients`` (bool).
    """
    parser = argparse.ArgumentParser(description=DESCRIPTION)
    # Required: the script hands this value directly to sc.textFile(), so a
    # missing path should be reported as a usage error, not a late crash.
    parser.add_argument('--input', required=True, help="Data file to read")
    parser.add_argument('--find-top-ports', action="store_true",
                        help="Find top ports")
    parser.add_argument('--find-top-ssh-clients', action="store_true",
                        help="Find addresses of most active ssh clients")
    return parser.parse_args(argv)
def top_ips(csv, which="both", num=10):
    """Return the ``num`` most frequent addresses as ``(count, address)`` pairs.

    Args:
        csv: RDD-like collection of rows, each row an indexable sequence of
            fields. Field 1 is read for ``"client"``, field 2 for
            ``"server"`` and fields 1 and 2 for ``"both"``.
        which: One of ``"both"``, ``"client"`` or ``"server"``; selects which
            address field(s) are counted.
        num: Maximum number of entries to return.

    Returns:
        A list of ``(count, address)`` tuples ordered by descending count.

    Raises:
        ValueError: If ``which`` is not one of the three accepted values.
    """
    # Imported locally so this function does not depend on a module-level
    # helper being defined before it.
    from operator import add

    if which == "both":
        ips = csv.flatMap(lambda row: row[1:3])
    elif which == "client":
        ips = csv.map(lambda row: row[1])
    elif which == "server":
        ips = csv.map(lambda row: row[2])
    else:
        # Previously an unknown selector left `ips` unbound and surfaced as
        # an UnboundLocalError further down.
        raise ValueError(
            "which must be 'both', 'client' or 'server', got %r" % (which,))

    ip_count = ips.map(lambda ip: (ip, 1)).reduceByKey(add)
    # Swap to (count, address) so sortByKey orders by count, descending.
    by_count = ip_count.map(lambda pair: (pair[1], pair[0]))
    return by_count.sortByKey(False).take(num)
...
...
@@ -26,12 +38,26 @@ def top_ports(csv, num=10):
port_count
=
ports
.
map
(
lambda
x
:
(
x
,
1
)).
reduceByKey
(
add
)
return
port_count
.
map
(
lambda
x
:
(
x
[
1
],
x
[
0
])).
sortByKey
(
False
).
take
(
num
)
def _print_ranking(title, row_format, headers, ranking):
    """Print ``(count, key)`` pairs as a right-aligned two-column table.

    Args:
        title: Line printed above the table.
        row_format: ``str.format`` template with two positional fields.
        headers: Two column titles, in (key, count) order.
        ranking: Iterable of ``(count, key)`` tuples.
    """
    print("\n\n")
    print(title)
    print(row_format.format(*headers))
    for count, key in ranking:
        print(row_format.format(key, count))


# Driver: read the input named on the command line and run whichever
# analyses were requested. Each row becomes a list of comma-separated fields.
opts = parse_args()
csv = sc.textFile(opts.input).map(lambda line: line.split(","))

if opts.find_top_ports:
    print("Finding top ports")
    top = top_ports(csv)
    _print_ranking("Top ports:", "{:>6} {:>12}", ("Port", "Count"), top)

if opts.find_top_ssh_clients:
    print("Finding active ssh ips")
    # Keep rows whose field 3 equals '22' (presumably the destination port,
    # i.e. SSH traffic -- confirm against the CSV layout).
    ssh_ips = csv.filter(lambda row: row[3] == '22')
    top_ssh_clients = top_ips(ssh_ips, "client", 15)
    _print_ranking("Top addresses involved in ssh traffic",
                   "{:>15} {:>9}", ("Address", "Count"), top_ssh_clients)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment