Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
Gurvinder Singh
spark_apps
Commits
c655f949
Commit
c655f949
authored
Jun 27, 2014
by
Sigmund Augdal
Browse files
Test netflow analysis app
parent
6524f2c8
Changes
1
Show whitespace changes
Inline
Side-by-side
pythonApp/netflowTest.py
0 → 100644
View file @
c655f949
from
pyspark.conf
import
SparkConf
from
pyspark
import
SparkContext
# Build the Spark configuration for this job, then start the context with it.
conf = SparkConf()
conf.setAppName("Netflow test")
conf.set("spark.executor.memory", "1g")
conf.set("spark.default.parallelism", 4)

sc = SparkContext(conf=conf)
def add(x, y):
    """Return ``x + y``.

    Passed to ``reduceByKey`` elsewhere in this script to sum the per-key
    counters.
    """
    return x + y
#path = 'hdfs://daas/user/hdfs/trd_gw1_12_01_normalized.csv'
path = 'hdfs://daas/user/hdfs/trd_gw1_12_normalized.csv/*'

# Each record becomes the list of comma-separated fields of one input line.
# The RDD is cached because it is reused by the analyses below.
csv = (
    sc.textFile(path)
    .map(lambda line: line.split(","))
    .cache()
)
def top_ips(csv, num=10):
    """Return the ``num`` most frequently occurring IP addresses.

    ``csv`` is an RDD whose records are lists of fields (one split CSV row
    each).  Fields 1 and 2 of every row are both counted as IP addresses
    (presumably source and destination -- confirm against the data schema).

    Returns a list of ``(count, ip)`` tuples, highest count first.
    """
    ips = csv.flatMap(lambda row: row[1:3])
    ip_count = ips.map(lambda ip: (ip, 1)).reduceByKey(add)
    # Put the count first so that tuple ordering ranks by count.
    by_count = ip_count.map(lambda pair: (pair[1], pair[0]))
    # top() returns the largest elements in descending order without
    # sorting the whole RDD, unlike sortByKey(False).take(num).  Entries
    # with equal counts are ordered by the ip value.
    return by_count.top(num)
def top_ports(csv, num=10):
    """Return the ``num`` most frequently occurring values of field 3.

    ``csv`` is an RDD whose records are lists of fields (one split CSV row
    each).  Field 3 is treated as the port (whether it is the source or the
    destination port cannot be told from this script -- confirm against the
    data schema).  Rows with fewer than four fields, such as blank lines,
    are skipped instead of raising ``IndexError`` inside the job.

    Returns a list of ``(count, port)`` tuples, highest count first.
    """
    ports = csv.filter(lambda row: len(row) > 3).map(lambda row: row[3])
    port_count = ports.map(lambda port: (port, 1)).reduceByKey(add)
    # Put the count first so that tuple ordering ranks by count.
    by_count = port_count.map(lambda pair: (pair[1], pair[0]))
    # top() returns the largest elements in descending order without
    # sorting the whole RDD, unlike sortByKey(False).take(num).  Entries
    # with equal counts are ordered by the port value.
    return by_count.top(num)
# print "Finding top ports"
# top = top_ports(csv)
# print "Port Count"
# for count, port in top:
# print port, count
# Single-argument print in call form prints the same on Python 2 and also
# parses on Python 3.
print("Finding active ssh ips")
# Keep only rows whose field 3 is '22' (the field top_ports treats as the
# port).  The length guard skips blank or short rows, which would otherwise
# raise IndexError inside the job.
ssh_ips = csv.filter(lambda row: len(row) > 3 and row[3] == '22')
print(top_ips(ssh_ips, 15))
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment