public class SimpleApp {
public static void main(String[] args) {
String logFile = ""; // Should be some file on your system
JavaSparkContext sc = new JavaSparkContext("local", "Simple App",
new String[]{"target/simple-project-1.0.jar"});
JavaRDD<String> logData = sc.textFile(logFile).cache();
long numAs = logData.filter(new Function<String, Boolean>() {
public Boolean call(String s) { return s.contains("a"); }
long numBs = logData.filter(new Function<String, Boolean>() {
public Boolean call(String s) { return s.contains("b"); }
System.out.println("Lines with a: " + numAs + ", lines with b: " + numBs);
from pyspark import SparkContext
# Sum a small in-memory list of integers through Spark and print the total.
sc = SparkContext(appName='test')

# The integers 1 through 5.
data = list(range(1, 6))

# Hand the local list to Spark as an RDD.
distData = sc.parallelize(data)

# Fold the elements together with pairwise addition, then print the result.
total = distData.reduce(lambda x, y: x + y)
print(total)
from pyspark import SparkContext
from pyspark.sql import SQLContext
# Load a comma-separated log from HDFS, expose it to Spark SQL as the table
# "records", and build a query counting the rows whose port is <= 80.
sc = SparkContext(appName='testSQL')
sqlCtx = SQLContext(sc)

# Read the file as an RDD of raw text lines.
lines = sc.textFile("hdfs://daas/user/hdfs/trd_gw1_12_01_normalized.csv")

# Split every line on commas into a list of string fields.
parts = lines.map(lambda l: l.split(","))

# Turn each field list into a dict; the first four fields are used, and the
# fourth is converted to int so it can be compared numerically in SQL.
records = parts.map(lambda p: {"date": p[0], "src_ip": p[1], "dest_ip": p[2], "port": int(p[3])})

recordsTable = sqlCtx.inferSchema(records)

# The query below selects FROM records, so the inferred table has to be
# registered under that name first.
# NOTE(review): registerTempTable is the Spark 1.1+ spelling; on Spark 1.0
# the equivalent call is registerAsTable -- confirm against the cluster version.
recordsTable.registerTempTable("records")

# Count the rows with port <= 80 (the stray ")" after 80 made the SQL invalid).
http = sqlCtx.sql("SELECT count(*) FROM records WHERE port <= 80")
