Spark SQL:-
https://github.com/rklick-solutions/spark-tutorial/wiki/Spark-SQL#introduction
https://github.com/rklick-solutions/spark-tutorial/wiki/Spark-Core
http://www.waitingforcode.com/apache-spark-sql
Hive database used below: ndulam
http://hadoop-makeitsimple.blogspot.com/2016/05/custom-partitioner-in-spark.html
https://acadgild.com/blog/partitioning-in-spark/
http://timepasstechies.com/category/programming/data-analytics/spark/
https://github.com/apache/spark/blob/master/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala
import org.apache.spark.HashPartitioner
import org.apache.spark.RangePartitioner
val ordersRdd = sc.textFile("/user/ndulam/retail/ordersdir/orders")
val productsRdd = sc.textFile("/user/ndulam/retail/productsdir/products").filter(x=> x.split(",").length==6)
val order_itemsRdd = sc.textFile("/user/ndulam/retail/order_itemsdir/order_items")
val departmentsRdd = sc.textFile("/user/ndulam/retail/departmentsdir/departments")
val customersRdd = sc.textFile("/user/ndulam/retail/customersdir/customers")
val categoriesRdd = sc.textFile("/user/ndulam/retail/categoriesdir/categories")
val temp = ordersRdd.flatMap(x=>x.split(",")).map(word=>(word,1))
val temp2 = temp.partitionBy(new RangePartitioner(10,temp))
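The imported HashPartitioner works the same way; a one-line sketch (the partition count 10 is arbitrary):
// hash-partition the pair RDD into 10 partitions and cache the result
val temp3 = temp.partitionBy(new HashPartitioner(10)).persist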
case class category(category_id:Int,category_department_id:Int,category_name:String)
case class customers(customer_id:Int,customer_fname:String,customer_lname:String,customer_email:String,customer_password:String,customer_street:String,customer_city:String,customer_state:String,customer_zipcode:String)
case class department(department_id:Int,department_name:String)
case class order_items(order_item_id:Int,order_item_order_id:Int,order_item_product_id:Int,order_item_quantity:Int,order_item_subtotal:Float,order_item_product_price:Float)
case class orders(order_id:Int,order_date:String,order_customer_id:Int,order_status:String)
case class products(product_id:Int,product_category_id:Int,product_name:String,product_description:String,product_price:Float,product_image:String)
val ordersdf = ordersRdd.map(line=>line.split(",")).map(tp=>orders(tp(0).toInt,tp(1),tp(2).toInt,tp(3))).toDF
val productsdf = productsRdd.map(line=>line.split(",")).map( tp=>products(tp(0).toInt,tp(1).toInt,tp(2),tp(3),tp(4).toFloat,tp(5))).toDF
val orderitemsdf = order_itemsRdd.map(line=>line.split(",")).map( tp=> order_items(tp(0).toInt,tp(1).toInt,tp(2).toInt,tp(3).toInt,tp(4).toFloat,tp(5).toFloat)).toDF
val departmentsdf = departmentsRdd.map(line=>line.split(",")).map(tp=> department(tp(0).toInt,tp(1))).toDF
val customerdf = customersRdd.map(line=>line.split(",")).map(tp=>customers(tp(0).toInt,tp(1),tp(2),tp(3),tp(4),tp(5),tp(6),tp(7),tp(8)) ).toDF
val categorydf = categoriesRdd.map(line=>line.split(",")).map(tp=>category(tp(0).toInt,tp(1).toInt,tp(2))).toDF
val ordersds = ordersdf.as[orders]
val productsds = productsdf.as[products]
val orderitemsds = orderitemsdf.as[order_items]
val departmentsds = departmentsdf.as[department]
val customerds = customerdf.as[customers]
val categoryds = categorydf.as[category]
Alternatively, load the same tables from Hive:
val ordersdf = sqlContext.sql("select * from ndulam.orders")
val productsdf = sqlContext.sql("select * from ndulam.products")
val orderitemsdf = sqlContext.sql("select * from ndulam.order_items")
val departmentsdf = sqlContext.sql("select * from ndulam.department")
val customerdf = sqlContext.sql("select * from ndulam.customers")
val categorydf = sqlContext.sql("select * from ndulam.category")
Writing each DataFrame out as parquet:
ordersdf.write.format("parquet").save("/user/ndulam/retail/parquet/orders.parquet")
productsdf.write.format("parquet").save("/user/ndulam/retail/parquet/products.parquet")
orderitemsdf.write.format("parquet").save("/user/ndulam/retail/parquet/orderitems.parquet")
departmentsdf.write.format("parquet").save("/user/ndulam/retail/parquet/departments.parquet")
customerdf.write.format("parquet").save("/user/ndulam/retail/parquet/customer.parquet")
categorydf.write.format("parquet").save("/user/ndulam/retail/parquet/category.parquet")
groupBy:
ordersdf.groupBy("order_status").count.show
ordersdf.groupBy("order_status").sum("order_id")
ordersdf.groupBy("order_status").max("order_id").show
ordersdf.groupBy("order_status").avg("order_id").show
ordersdf.groupBy("order_status").agg(sum("order_id"),avg("order_id")).show
ordersdf.groupBy("order_status").agg(sum("order_id"),avg("order_id")).show // using agg we can apply multiple aggregate functions(avg,min,max,sum) on different columns
We can apply above aggregate functions without grouping also as below
ordersdf.select(avg("order_id")).show
ordersdf.agg(avg("order_id")).show
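A small sketch of agg applying different aggregates to different columns, using the order_items DataFrame defined above:
import org.apache.spark.sql.functions.{sum,avg,max}
orderitemsdf.groupBy("order_item_order_id").agg(sum("order_item_subtotal") as "order_total", max("order_item_quantity") as "max_qty", avg("order_item_product_price") as "avg_price").show(5)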
pivot:
https://databricks.com/blog/2016/02/09/reshaping-data-with-pivot-in-apache-spark.html
https://svds.com/pivoting-data-in-sparksql/
val df = sqlContext.read.format("com.databricks.spark.csv").option("header", "true").option("inferSchema", "true").load("/user/ndulam/cars/mpg.csv")
case class milage(ser:Int,manufacturer:String,model:String,displ:Float,year:Int,cyl:Int,trans:String,drv:String,cty:Int,hwy:Int,fl:String,carclass:String)
val file = sc.textFile("/user/ndulam/cars/mpg.csv")
val header = file.first
val milagerdd = file.filter(line=>line!=header)
val milagedf = milagerdd.map(_.split(",")).map(arr=>milage(arr(0).toInt,arr(1),arr(2),arr(3).toFloat,arr(4).toInt,arr(5).toInt,arr(6),arr(7),arr(8).toInt,arr(9).toInt,arr(10),arr(11))).toDF
val milageds = milagedf.as[milage]
milagedf.first:
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
milagedf.groupBy("carclass").pivot("year").agg(min("cty"),max("cty"),min("cty"),max("cty")).show
+----------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
| carclass|1999_min(cty)|1999_max(cty)|1999_min(cty)|1999_max(cty)|2008_min(cty)|2008_max(cty)|2008_min(cty)|2008_max(cty)|
+----------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
| suv| 11| 18| 11| 18| 9| 20| 9| 20|
| 2seater| 15| 16| 15| 16| 15| 16| 15| 16|
| pickup| 11| 16| 11| 16| 9| 17| 9| 17|
| midsize| 15| 21| 15| 21| 16| 23| 16| 23|
| compact| 15| 33| 15| 33| 15| 28| 15| 28|
| minivan| 15| 18| 15| 18| 11| 17| 11| 17|
|subcompact| 15| 35| 15| 35| 14| 26| 14| 26|
+----------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
na functions (dropping rows with missing values, analogous to this R idiom): https://stackoverflow.com/questions/4862178/remove-rows-with-nas-missing-values-in-data-frame
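In Spark the equivalent lives on DataFrame.na; a minimal sketch:
// drop rows that contain any null values
ordersdf.na.drop().count
// drop rows only when the listed columns are null
ordersdf.na.drop(Seq("order_status")).count
// replace nulls with a default instead of dropping the row
ordersdf.na.fill("UNKNOWN", Seq("order_status")).count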
filter:
ordersdf.filter("order_status='PENDING'").count
ordersdf.filter(ordersdf("order_status")='PENDING').count
ordersdf.filter("order_id>68880").show
ordersdf.filter(ordersdf("order_id")>68880).show
ordersdf.filter(ordersdf("order_status")==="PENDING").show
sort:
ordersdf.sort(ordersdf("order_id").desc).show or ordersdf.sort($"order_id".desc).show
ordersdf.sort(ordersdf("order_id")).show or ordersdf.sort($"order_id").show
orderBy:
ordersdf.orderBy(ordersdf("order_id").desc).show or ordersdf.orderBy(desc("order_id")).show
ordersdf.orderBy(ordersdf("order_id")).show
head/limit :- limit returns a new DataFrame made of the first n rows. The difference is that head returns an Array of Rows to the driver while limit returns a new DataFrame.
ordersdf.head(2) ==> Array[org.apache.spark.sql.Row]
ordersdf.limit(2) ==> org.apache.spark.sql.DataFrame
unionAll/intersect/except
val pendingdf = ordersdf.filter(ordersdf("order_status")==="PENDING")
ordersdf.unionAll(pendingdf)
ordersdf.intersect(pendingdf)
ordersdf.except(pendingdf)
withColumn:
Returns a new DataFrame by adding a column, or replacing an existing column that has the same name. Compare the two examples below: the first replaces the order_item_quantity column, the second adds a new Decider column.
import org.apache.spark.sql.functions.{udf,col}
val quantityDecider:(Int=>String) = (quantity:Int) => {
  if(quantity<5) "less than 5" else "Equal or More"
}
val quantityfun = udf(quantityDecider)
orderitemsdf.withColumn("order_item_quantity",quantityfun(col("order_item_quantity"))).printSchema
root
|-- order_item_id: integer (nullable = false)
|-- order_item_order_id: integer (nullable = false)
|-- order_item_product_id: integer (nullable = false)
|-- order_item_quantity: string (nullable = true)
|-- order_item_subtotal: float (nullable = false)
|-- order_item_product_price: float (nullable = false)
orderitemsdf.withColumn("Decider",quantityfun(col("order_item_quantity"))).printSchema
root
|-- order_item_id: integer (nullable = false)
|-- order_item_order_id: integer (nullable = false)
|-- order_item_product_id: integer (nullable = false)
|-- order_item_quantity: integer (nullable = false)
|-- order_item_subtotal: float (nullable = false)
|-- order_item_product_price: float (nullable = false)
|-- Decider: string (nullable = true)
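The same function can also be registered for use inside SQL strings; a small sketch (the registered name decider is arbitrary):
sqlContext.udf.register("decider", quantityDecider)
sqlContext.sql("select order_item_id, decider(order_item_quantity) as decider from ndulam.order_items").show(5)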
withColumnRenamed: Returns a new DataFrame with a column renamed.
orderitemsdf.withColumnRenamed("order_item_quantity","quantity")
org.apache.spark.sql.DataFrame = [order_item_id: int, order_item_order_id: int, order_item_product_id: int, quantity: int, order_item_subtotal: float, order_item_product_price: float]
drop():
orderitemsdf.drop("order_item_quantity")
res52: org.apache.spark.sql.DataFrame = [order_item_id: int, order_item_order_id: int, order_item_product_id: int, order_item_subtotal: float, order_item_product_price: float]
dropDuplicates:
Returns a new DataFrame that contains only the unique rows from this DataFrame. This is an alias for distinct.
ordersdf.unionAll(pendingdf).dropDuplicates().count
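dropDuplicates can also be limited to a subset of columns; a one-line sketch (which duplicate row survives is not defined):
ordersdf.dropDuplicates(Seq("order_status")).show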
describe:
describe returns a DataFrame containing information such as the number of non-null entries (count), mean, standard deviation, and minimum and maximum value for each numerical column.
ordersdf.describe("order_id").show
+-------+------------------+
|summary| order_id|
+-------+------------------+
| count| 68883|
| mean| 34442.0|
| stddev|19884.953633337947|
| min| 1|
| max| 68883|
+-------+------------------+
To check the logical/physical execution plan:
co.explain()
co.queryExecution.executedPlan
Datasets operations:
select:
ordersds.select($"order_id".as[String])
Joining:
ordersds.joinWith(orderitemsds, ordersds("order_id")===orderitemsds("order_item_order_id"))
val result = ordersds.joinWith(orderitemsds, ordersds("order_id")===orderitemsds("order_item_order_id"))
result: org.apache.spark.sql.Dataset[(orders, order_items)] = [_1: struct<order_id:int,order_date:string,order_customer_id:int,order_status:string>, _2: struct<order_item_id:int,order_item_order_id:int,order_item_product_id:int,order_item_quantity:int,order_item_subtotal:float,order_item_product_price:float>]
select on joined data:
result.map(r=>r._1.order_date)
groupBy:
ordersds.groupBy(_.order_status).agg(sum($"order_id").as[Double],avg($"order_customer_id").as[Double]).collect
Broadcast Join:-
By default Spark performs a broadcast join when one of the tables being joined is smaller than 10 MB (10485760 bytes, property spark.sql.autoBroadcastJoinThreshold).
To request a broadcast join explicitly (e.g. when the table is larger than 10 MB), use the broadcast function:
import org.apache.spark.sql.functions.broadcast
val bc = broadcast(customerdf)
val co = ordersdf.join(bc, bc("customer_id")=== ordersdf("order_customer_id"))
To disable broadcast joins, set the property to -1:
sqlContext.sql("SET spark.sql.autoBroadcastJoinThreshold = -1")
Converting a string with European number formatting (thousands separated by '.', decimal ',') to a decimal, by stripping the dots and swapping the comma:
CAST(regexp_replace(regexp_replace(TRIM(col2),'\\.',''),',','.') as decimal(12,2))
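A minimal sketch of the same expression through selectExpr; the column name col2 and the sample values are hypothetical:
// "1.234,56" -> 1234.56 (the extra backslashes survive Scala string escaping)
val raw = Seq("1.234,56", "7.890,10").toDF("col2")
raw.selectExpr("CAST(regexp_replace(regexp_replace(TRIM(col2),'\\\\.',''),',','.') as decimal(12,2)) as amount").show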
1. find customer wise shopping total?
val goi = orderitemsdf.groupBy("order_item_order_id").agg(sum("order_item_subtotal") as "sumtotal")
val co = customerdf.join(ordersdf, customerdf("customer_id")=== ordersdf("order_customer_id")).select("customer_fname","order_id")
val result = goi.join(co,goi("order_item_order_id")===co("order_id")).select("customer_fname","sumtotal")
2. find category wise orders total?
val oigps = orderitemsdf.select("order_item_product_id","order_item_subtotal").groupBy("order_item_product_id").agg(sum("order_item_subtotal") as "Totalcost")
val jsp = oigps.join(productsdf,oigps("order_item_product_id")===productsdf("product_id")).select("product_category_id","Totalcost")
val jcs = jsp.join(categorydf,jsp("product_category_id")===categorydf("category_id")).select("category_name","Totalcost")
3. find city wise sales? (a sketch follows after this list)
4. find each product's product_price?
5. total order items per order?
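A possible sketch for exercise 3, joining customers to orders and order-item totals and summing per city (column names follow the tables defined above):
import org.apache.spark.sql.functions.sum
val orderTotals = orderitemsdf.groupBy("order_item_order_id").agg(sum("order_item_subtotal") as "order_total")
val custOrders = customerdf.join(ordersdf, customerdf("customer_id")===ordersdf("order_customer_id")).select("customer_city","order_id")
val citySales = custOrders.join(orderTotals, custOrders("order_id")===orderTotals("order_item_order_id")).groupBy("customer_city").agg(sum("order_total") as "city_sales")
citySales.show(10)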
Adding a keytab entry with ktutil:
addent -password -p [email protected] -k 1 -e rc4-hmac
Best Practices:
1. After repartitioning with partitionBy, persist the result to avoid re-shuffling on every action.
example:- rdd.partitionBy(new RangePartitioner(...)/new HashPartitioner(...)).persist
2. Performing a join on RDDs pre-partitioned with the same partitioner (using partitionBy) avoids the shuffle; see the sketch below.
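A minimal sketch of practice 2, co-partitioning two pair RDDs before the join (the partition count 8 is arbitrary):
import org.apache.spark.HashPartitioner
val part = new HashPartitioner(8)
// key both RDDs the same way and give them the same partitioner, then persist
val ordersById = ordersRdd.map(_.split(",")).map(a=>(a(0).toInt, a(3))).partitionBy(part).persist
val itemsByOrderId = order_itemsRdd.map(_.split(",")).map(a=>(a(1).toInt, a(4).toFloat)).partitionBy(part).persist
// both sides share the partitioner, so the join itself needs no extra shuffle
val joined = ordersById.join(itemsByOrderId)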
create external table category(category_id INT,category_department_id INT,category_name STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE location '/user/ndulam/retail/categoriesdir';
create external table customers(customer_id INT,customer_fname STRING,customer_lname STRING,customer_email STRING,customer_password STRING,customer_street STRING,customer_city STRING,customer_state STRING,customer_zipcode STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE location '/user/ndulam/retail/customersdir';
create external table department(department_id INT,department_name STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE location '/user/ndulam/retail/departmentsdir';
create external table order_items(order_item_id INT,order_item_order_id INT,order_item_product_id INT,order_item_quantity INT,order_item_subtotal FLOAT,order_item_product_price FLOAT) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE location '/user/ndulam/retail/order_itemsdir';
create external table orders(order_id INT,order_date STRING,order_customer_id INT,order_status STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE location '/user/ndulam/retail/ordersdir';
create external table products(product_id INT,product_category_id INT,product_name STRING,product_description STRING,product_price FLOAT,product_image STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE location '/user/ndulam/retail/productsdir';
ANALYZE TABLE category COMPUTE STATISTICS noscan;
Joins:-
orders (id, customers_id, amount):
[1,1,50.0]
[2,2,10.0]
[3,2,10.0]
[4,2,10.0]
[5,1000,19.0]
customers (id, login):
[1,Customer_1]
[2,Customer_2]
[3,Customer_3]
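One way to materialize these two frames in the shell to reproduce the outputs below (a sketch; toDF comes from the shell implicits):
val orders = sc.parallelize(Seq((1,1,50.0),(2,2,10.0),(3,2,10.0),(4,2,10.0),(5,1000,19.0))).toDF("id","customers_id","amount")
val customers = sc.parallelize(Seq((1,"Customer_1"),(2,"Customer_2"),(3,"Customer_3"))).toDF("id","login")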
orders.join(customers,orders("customers_id") === customers("id"), "leftsemi").collect
[1,1,50.0]
[2,2,10.0]
[3,2,10.0]
[4,2,10.0]
orders.join(customers,orders("customers_id") === customers("id"), "leftouter").collect
[1,1,50.0,1,Customer_1]
[2,2,10.0,2,Customer_2]
[3,2,10.0,2,Customer_2]
[4,2,10.0,2,Customer_2]
[5,1000,19.0,null,null]
creating schema:
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StructType,StructField,StringType,DateType}
StructType(Seq(
  StructField("LAST_UPDATE_DATE", DateType, false),
  StructField("VENDOR_ID", StringType, false)
  // ... further StructField("name", Type, nullable) entries go here
))
val rdd1 = sc.makeRDD(Array(("A","Naresh"),("B","Aparna"),("C","Chitti")))
val rdd2 = sc.makeRDD(Array(("A","Sathaiah"),("C","Jyothi"),("D","Vinay"),("A","Dulam"),("B","Balagoni"),("C","Done")))
val result = rdd1.cogroup(rdd2)
result.collect
Array[(String, (Iterable[String], Iterable[String]))] = Array((A,(CompactBuffer(Naresh),CompactBuffer(Sathaiah, Dulam))),
(B,(CompactBuffer(Aparna),CompactBuffer(Balagoni))), (C,(CompactBuffer(Chitti),CompactBuffer(Jyothi, Done))),
(D,(CompactBuffer(),CompactBuffer(Vinay))))
Example of iterating through the cogrouped Iterables:-
val result1 = result.map{ x =>
  val key = x._1
  val value = x._2
  val temp1 = value._1.toList
  val temp2 = value._2.toList
  import scala.collection.mutable.ArrayBuffer
  var names = ArrayBuffer[String]()
  // keep only the names from the second iterable that start with "D"
  for (t1 <- temp2 if t1.startsWith("D")) {
    names += t1
  }
  (key, names)
}
file.mapPartitionsWithIndex((idx, rows) => {
  val len = rows.toList.length
  Iterator((idx, len))
})
import org.apache.spark.util.SizeEstimator
println(SizeEstimator.estimate(file)) // estimated size in bytes
The repartition algorithm does a full data shuffle and equally distributes the data among the partitions. It does not attempt to minimize data movement like the coalesce algorithm.
file.mapPartitionsWithIndex{case (i,rows) => Iterator((i,rows.size))}
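A quick sketch contrasting the two, using the same partition-size probe (the partition counts are arbitrary):
// repartition: full shuffle, data spread evenly over 10 partitions
file.repartition(10).mapPartitionsWithIndex{case (i,rows) => Iterator((i,rows.size))}.collect
// coalesce: merges existing partitions down to 2 without a full shuffle, so sizes may stay uneven
file.coalesce(2).mapPartitionsWithIndex{case (i,rows) => Iterator((i,rows.size))}.collect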
spark-submit --master yarn --driver-memory 20g --executor-memory 20g --num-executors 50 --conf "spark.executor.extraJavaOptions=-XX:+UseG1GC -XX:+UseCompressedOops" --conf "spark.local.dir=/opt/data/stage01/app/temp" --conf "spark.worker.cleanup.enabled=true" --conf "spark.broadcast.blockSize=16m" --conf "spark.yarn.executor.memoryOverhead=5120" --conf spark.network.timeout=800 --conf spark.rpc.askTimeout=800 --conf spark.locality.wait=10s --conf spark.yarn.max.executor.failures=100 --conf spark.shuffle.manager=SORT --conf spark.shuffle.service.enabled=true --conf spark.shuffle.compress=true --conf spark.shuffle.spill.compress=true --class classname $APP_DIR/bin/ott-report-sumry-spark-1.0.jar
spark-submit --class com.naresh.org.CustomerwiseTotal --master yarn --deploy-mode cluster --num-executors 3 --executor-memory 3G --executor-cores 4 SparkApp-1.0-SNAPSHOT.jar
(Note: spark-submit options must come before the application jar; anything after the jar is passed to the application as arguments.)
spark-submit --class com.naresh.org.CustomerwiseTotal --master yarn --deploy-mode cluster SparkApp-1.0-SNAPSHOT.jar
spark-submit --class com.naresh.org.CategorywiseTotal --master yarn --deploy-mode cluster SparkApp-1.0-SNAPSHOT.jar
spark-submit --class com.naresh.org.StreamApp --deploy-mode cluster SparkStreamingApps-0.0.1-SNAPSHOT.jar
spark-submit --class com.naresh.org.HBaseTest --master yarn --jars /usr/hdp/current/hbase-client/lib/hbase-client.jar --deploy-mode cluster SparkApp-1.0-SNAPSHOT.jar
spark-submit --class com.naresh.org.HBaseTest --master yarn --files /etc/hbase/conf/hbase-site.xml,/etc/hadoop/conf/core-site.xml,/etc/hadoop/conf/hdfs-site.xml,/home/ndulam/ndulam.keytab --jars /usr/hdp/current/hbase-client/lib/hbase-client.jar,/usr/hdp/current/hbase-master/lib/hbase-server.jar,/usr/hdp/current/hbase-master/lib/htrace-core-3.1.0-incubating.jar,/usr/hdp/current/hbase-master/lib/metrics-core-2.2.0.jar --deploy-mode client SparkApp-1.0-SNAPSHOT.jar --principal [email protected] --keytab /home/ndulam/ndulam.keytab
Streaming:
java -cp "/Users/ndulam/kafka/libs/*":SparkStreamingApps-0.0.1-SNAPSHOT.jar com.naresh.org.KafkaCarProducer
bin/kafka-console-consumer.sh --topic cars --bootstrap-server localhost:9093
bin/kafka-topics.sh --zookeeper localhost:2181 --list
bin/kafka-console-consumer.sh --bootstrap-server localhost:9093 --topic fastcars
spark-submit --class com.naresh.org.StreamApp FirstAppS-1.0-SNAPSHOT.jar
spark-submit --packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.2.0 --jars ~/.ivy2/jars/kafka-clients-0.10.2.1.jar --class com.naresh.org.AppStream ./target/SampleStreamApp-1.0-SNAPSHOT.jar
/Users/ndulam/Desktop/workspace/SparkTraining/SparkStreamingApps/target
java -cp "/Users/ndulam/kafka/libs/*":SparkStreamingApps-0.0.1-SNAPSHOT.jar com.naresh.org.KafkaCarProducer jeep
cd /Users/ndulam/Desktop/SparkStreamWorkspace/SampleStreamApp
spark-submit --packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.2.0 --class com.naresh.org.AppStream ./target/SampleStreamApp-1.0-SNAPSHOT.jar jeep
spark-submit --packages org.apache.spark:spark-streaming_2.11:2.2.1 --class com.naresh.org.AppStream ./target/SampleStreamApp-1.0-SNAPSHOT.jar jeep
spark-submit --packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.2.0 --jars $(echo ~/.ivy2/jars/*.jar | tr ' ' ',') --class com.naresh.org.DiffUpdateStateByKeyAndMapWithState target/SampleStreamApp-1.0-SNAPSHOT.jar
Data Preparation:
val names = List("Liam","Mason","Jacob","William","Ethan","James","Alexander","Michael","Benjamin","Elijah","Daniel","Aiden","Logan","Matthew","Lucas","Jackson","David","Oliver","Jayden","Joseph","Gabriel","Samuel","Carter","Anthony","John","Dylan","Luke","Henry","Andrew","Isaac","Christopher","Joshua","Wyatt","Sebastian","Owen","Caleb","Nathan","Ryan","Jack","Hunter","Levi","Christian","Jaxon","Julian","Landon","Grayson","Jonathan","Isaiah","Charles","Thomas","Aaron","Eli","Connor","Jeremiah","Cameron","Josiah","Adrian","Colton","Jordan","Brayden","Nicholas","Robert","Angel","Hudson","Lincoln","Evan","Dominic","Austin","Gavin","Nolan","Parker","Adam","Chase","Jace","Ian","Cooper","Easton","Kevin","Jose","Tyler","Brandon","Asher","Jaxson","Mateo","Jason","Ayden","Zachary","Carson","Xavier","Leo","Ezra","Bentley","Sawyer","Kayden","Blake","Nathaniel","Related","Post","BOY","NAMES","THAT","START","WITH","C","Ryder","Theodore","Elias","Tristan","Roman","Leonardo","Camden","Brody","Luis","Miles","Micah","Vincent","Justin","Greyson","Declan","Maxwell","Juan","Cole","Damian","Carlos","Max","Harrison","Weston","Brantley","Braxton","Axel","Diego","Abel","Wesley","Santiago","Jesus","Silas","Giovanni","Bryce","Jayce","Bryson","Alex","Everett","George","Eric","Ivan","Emmett","Kaiden","Ashton","Kingston","Jonah","Jameson","Kai","Maddox","Timothy","Ezekiel","Ryker","Emmanuel","Hayden","Antonio","Bennett","Steven","Richard","Jude","Luca","Edward","Joel","Victor","Miguel","Malachi","King","Patrick","Kaleb","Bryan","Alan","Marcus","Preston","Abraham","Calvin","Colin","Bradley","Jeremy","Kyle","Graham","Grant","Jesse","Kaden","Alejandro","Oscar","Jase","Karter","Maverick","Aidan","Tucker","Avery","Amir","Brian","Iker","Matteo","Caden","Zayden","Riley","August","Mark","Maximus","Brady","Kenneth","Paul","Jaden","Nicolas","Beau","Dean","Jake","Peter","Xander","Elliot","Finn","Derek","Sean","Cayden","Elliott","Jax","Jasper","Lorenzo","Omar","Beckett","Rowan","Gael","Corbin","Waylon","Myles","Tanner","Jorge","Javier","Zion","Andres","Charlie","Paxton","Emiliano","Brooks","Zane","Simon","Judah","Griffin","Cody","Gunner","Dawson","Israel","Rylan","Gage","Messiah","River","Kameron","Stephen","Francisco","Clayton","Zander","Chance","Eduardo","Spencer","Lukas","Damien","Dallas","Conner","Travis","Knox","Raymond","Peyton","Devin","Felix","Jayceon","Collin","Amari","Erick","Cash","Jaiden","Fernando","Cristian","Josue","Keegan","Garrett","Rhett","Ricardo","Martin","Reid","Seth","Andre","Cesar","Titus","Donovan","Manuel","Mario","Caiden","Adriel","Kyler","Milo","Archer","Jeffrey","Holden","Arthur","Karson","Rafael","Shane","Lane","Louis","Angelo","Remington","Troy","Emerson","Maximiliano","Hector","Emilio","Anderson","Trevor","Phoenix","Walter","Johnathan","Johnny","Edwin","Julius","Barrett","Leon","Tyson","Tobias","Edgar","Dominick","Marshall","Marco","Joaquin","Dante","Andy","Cruz","Ali","Finley","Dalton","Gideon","Reed","Enzo","Sergio","Jett","Thiago","Kyrie","Ronan","Cohen","Colt","Erik","Trenton","Jared","Walker","Landen","Alexis","Nash","Jaylen","Gregory","Emanuel","Killian","Allen","Atticus","Desmond","Shawn","Grady","Quinn","Frank","Fabian","Dakota","Roberto","Beckham","Major","Skyler","Nehemiah","Drew","Cade","Muhammad","Kendrick","Pedro","Orion","Aden","Kamden","Ruben","Zaiden","Clark","Noel","Porter","Solomon","Romeo","Rory","Malik","Daxton","Leland","Kash","Abram","Derrick","Kade","Gunnar","Prince","Brendan","Leonel","Kason","Braylon","Legend","Pablo","Jay","Adan","Jensen","Esteban","Kellan","Drake","Warren",
"Ismael","Ari","Russell","Bruce","Finnegan","Marcos","Jayson","Theo","Jaxton","Phillip","Dexter","Braylen","Armando","Braden","Corey","Kolton","Gerardo","Ace","Ellis","Malcolm","Tate","Zachariah","Chandler","Milan","Keith","Danny","Damon","Enrique","Jonas","Kane","Princeton","Hugo","Ronald","Philip","Ibrahim","Kayson","Maximilian","Lawson","Harvey","Albert","Donald","Raul","Franklin","Hendrix","Odin","Brennan","Jamison","Dillon","Brock","Landyn","Mohamed","Brycen","Deacon","Colby","Alec","Julio","Scott","Matias","Sullivan","Rodrigo","Cason","Taylor","Rocco","Nico","Royal","Pierce","Augustus","Raiden","Kasen","Benson","Moses","Cyrus","Raylan","Davis","Khalil","Moises","Conor","Nikolai","Alijah","Mathew","Keaton","Francis","Quentin","Ty","Jaime","Ronin","Kian","Lennox","Malakai","Atlas","Jerry","Ryland","Ahmed","Saul","Sterling","Dennis","Lawrence","Zayne","Bodhi","Arjun","Darius","Arlo","Eden","Tony","Dustin","Kellen","Chris","Mohammed","Nasir","Omari","Kieran","Nixon","Rhys","Armani","Arturo","Bowen","Frederick","Callen","Leonidas","Remy","Wade","Luka","Jakob","Winston","Justice","Alonzo","Curtis","Aarav","Gustavo","Royce","Asa","Gannon","Kyson","Hank","Izaiah","Roy","Raphael","Luciano","Hayes","Case","Darren","Mohammad","Otto","Layton","Isaias","Alberto","Jamari","Colten","Dax","Marvin","Casey","Moshe","Johan","Sam","Matthias","Larry","Trey","Devon","Trent","Mauricio","Mathias","Issac","Dorian","Gianni","Ahmad","Nikolas","Oakley","Uriel","Lewis","Randy","Cullen","Braydon","Ezequiel","Reece","Jimmy","Crosby","Soren","Uriah","Roger","Nathanael","Emmitt","Gary","Rayan","Ricky","Mitchell","Roland","Alfredo","Cannon","Jalen","Tatum","Kobe","Yusuf","Quinton","Korbin","Brayan","Joe","Byron","Ariel","Quincy","Carl","Kristopher","Alvin","Duke","Lance","London","Jasiah","Boston","Santino","Lennon","Deandre","Madden","Talon","Sylas","Orlando","Hamza","Bo","Aldo","Douglas","Tristen","Wilson","Maurice","Samson","Cayson","Bryant","Conrad","Dane","Julien","Sincere","Noe","Salvador","Nelson","Edison","Ramon","Lucian","Mekhi","Niko","Ayaan","Vihaan","Neil","Titan","Ernesto","Brentley","Lionel","Zayn","Dominik","Cassius","Rowen","Blaine","Sage","Kelvin","Jaxen","Memphis","Leonard","Abdullah","Jacoby","Allan","Jagger","Yahir","Forrest","Guillermo","Mack","Zechariah","Harley","Terry","Kylan","Fletcher","Rohan","Eddie","Bronson","Jefferson","Rayden","Terrance","Marc","Morgan","Valentino","Demetrius","Kristian","Hezekiah","Lee","Alessandro","Makai","Rex","Callum","Kamari","Casen","Tripp","Callan","Stanley","Toby","Elian","Langston","Melvin","Payton","Flynn","Jamir","Kyree","Aryan","Axton","Azariah","Branson","Reese","Adonis","Thaddeus","Zeke","Tommy","Blaze","Carmelo","Skylar","Arian","Bruno","Kaysen","Layne","Ray","Zain","Crew","Jedidiah","Rodney","Clay","Tomas","Alden","Jadiel","Harper","Ares","Cory","Brecken","Chaim","Nickolas","Kareem","Xzavier","Kaison","Alonso","Amos","Vicente","Samir","Yosef","Jamal","Jon","Bobby","Aron","Ben","Ford","Brodie","Cain","Finnley","Briggs","Davion","Kingsley","Brett","Wayne","Zackary","Apollo","Emery","Joziah","Lucca","Bentlee","Hassan","Westin","Joey","Vance","Marcelo","Axl","Jermaine","Chad","Gerald","Kole","Dash","Dayton","Lachlan","Shaun","Kody","Ronnie","Kolten","Marcel","Stetson","Willie","Jeffery","Brantlee","Elisha","Maxim","Kendall","Harry","Leandro","Aaden","Channing","Kohen","Yousef","Darian","Enoch","Mayson","Neymar","Giovani","Alfonso","Duncan","Anders","Braeden","Dwayne","Keagan","Felipe","Fisher","Stefan","Trace","Aydin","Anson","Clyde","Blaise","Canaan","Maxto
n","Alexzander","Billy","Harold","Baylor","Gordon","Rene","Terrence","Vincenzo","Kamdyn","Marlon","Castiel","Lamar","Augustine","Jamie","Eugene","Harlan","Kase","Miller","Van","Kolby","Sonny","Emory","Junior","Graysen","Heath","Rogelio","Will","Amare","Ameer","Camdyn","Jerome","Maison","Micheal","Cristiano","Giancarlo","Henrik","Lochlan","Bode","Camron","Houston","Otis","Hugh","Kannon","Konnor","Emmet","Kamryn","Maximo","Adrien","Cedric","Dariel","Landry","Leighton","Magnus","Draven","Javon","Marley","Zavier","Markus","Justus","Reyansh","Rudy","Santana","Misael","Abdiel","Davian","Zaire","Jordy","Reginald","Benton","Darwin","Franco","Jairo","Jonathon","Reuben","Urijah","Vivaan","Brent","Gauge","Vaughn","Coleman","Zaid","Terrell","Kenny","Brice","Lyric","Judson","Shiloh","Damari","Kalel","Braiden","Brenden","Coen","Denver","Javion","Thatcher","Rey","Dilan","Dimitri","Immanuel","Mustafa","Ulises","Alvaro","Dominique","Eliseo","Anakin","Craig","Dario","Santos","Grey","Ishaan","Jessie","Jonael","Alfred","Tyrone","Valentin","Jadon","Turner","Ignacio","Riaan","Rocky","Ephraim","Marquis","Musa","Keenan","Ridge","Chace","Kymani","Rodolfo","Darrell","Steve","Agustin","Jaziel","Boone","Cairo","Kashton","Rashad","Gibson","Jabari","Avi","Quintin","Seamus","Rolando","Sutton","Camilo","Triston","Yehuda","Cristopher","Davin","Ernest","Jamarion","Kamren","Salvatore","Anton","Aydan","Huxley","Jovani","Wilder","Bodie","Jordyn","Louie","Achilles","Kaeden","Kamron","Aarush","Deangelo","Robin","Yadiel","Yahya","Boden","Ean","Kye","Kylen","Todd","Truman","Chevy","Gilbert","Haiden","Brixton","Dangelo","Juelz","Osvaldo","Bishop","Freddy","Reagan","Frankie","Malaki","Camren","Deshawn","Jayvion","Leroy","Briar","Jaydon","Antoine")
import scala.util.Random
names(Random.nextInt(names.size))
val file = sc.textFile("/Users/nd2629/Downloads/us_postal_codes.csv")
val data = file.zipWithIndex.filter(_._2!=0)
data.map(x=>x._1)
val browser = List("IE","Chrome","Firefox","Safari")
val status = List("Y","N")
import java.util.Calendar
val activity = data.map(x=>x._1).map(x=>userActivity(Random.nextInt(),names(Random.nextInt(names.size)),x.split(',')(1),x.split(',')(2) ,x.split(',')(4),browser(Random.nextInt(browser.size)),Calendar.getInstance().getTime(),status(Random.nextInt(status.size)),x.split(',')(5),x.split(',')(6),x.split(',')(0) ))
501,Holtsville,New York,NY,Suffolk,40.8154,-73.0451
case class userActivity(Id:Int, name:String, city:String, state:String, county:String, browser:String, time:java.util.Date, isActive:String, longitude:String, lattitude:String, zip:String)
> val activity = data.map(x=>x._1).map(x=>userActivity(Random.nextInt(),names(Random.nextInt(names.size)),x.split(',')(1),x.split(',')(2) ,x.split(',')(4),browser(Random.nextInt(browser.size)),Calendar.getInstance().getTime(),status(Random.nextInt(status.size)),x.split(',')(5),x.split(',')(6),x.split(',')(0) ))
val dataDS = Seq("""
{
"dcname": "dc-101",
"source": {
"sensor-igauge": {
"id": 10,
"ip": "68.28.91.22",
"description": "Sensor attached to the container ceilings",
"temp":35,
"c02_level": 1475,
"geo": {"lat":38.00, "long":97.00}
},
"sensor-ipad": {
"id": 13,
"ip": "67.185.72.1",
"description": "Sensor ipad attached to carbon cylinders",
"temp": 34,
"c02_level": 1370,
"geo": {"lat":47.41, "long":-122.00}
},
"sensor-inest": {
"id": 8,
"ip": "208.109.163.218",
"description": "Sensor attached to the factory ceilings",
"temp": 40,
"c02_level": 1346,
"geo": {"lat":33.61, "long":-111.89}
},
"sensor-istick": {
"id": 5,
"ip": "204.116.105.67",
"description": "Sensor embedded in exhaust pipes in the ceilings",
"temp": 40,
"c02_level": 1574,
"geo": {"lat":35.93, "long":-85.46}
}
}
}""").toDS()
val dataDS = Seq("""{"dcname":"dc-102","source":{"sensor-ipad":{"id":5,"ip":"68.28.91.25","description":"Sensor embedded in exhaust pipes in the ceilings","temp":62,"c02_level":666,"geo":{"lat":72.0,"longi":52.0}},"sensor-igauge":{"id":2,"ip":"67.185.72.2","description":"Sensor ipad attached to carbon cylinders","temp":39,"c02_level":438,"geo":{"lat":40.0,"longi":76.0}},"sensor-inest":{"id":8,"ip":"204.116.105.68","description":"Sensor ipad attached to carbon cylinders","temp":24,"c02_level":942,"geo":{"lat":44.0,"longi":26.0}},"sensor-istick":{"id":10,"ip":"204.116.105.10","description":"Sensor ipad attached to carbon cylinders","temp":5,"c02_level":1748,"geo":{"lat":35.0,"longi":51.0}}}}""").toDS()
import org.apache.spark.sql.types._
// schema matching the dc-102 record above, whose geo field is named "longi"
val schema = new StructType().add("dcname", StringType).add("source",
  MapType(
    StringType,
    new StructType()
      .add("id", LongType)
      .add("ip", StringType)
      .add("description",StringType)
      .add("temp", LongType)
      .add("c02_level", LongType)
      .add("geo",
        new StructType()
          .add("lat", DoubleType)
          .add("longi", DoubleType))))
// variant matching the dc-101 record, whose geo field is named "long"
val schema = new StructType().add("dcname", StringType).add("source",
  MapType(
    StringType,
    new StructType()
      .add("description", StringType)
      .add("ip", StringType)
      .add("id", LongType)
      .add("temp", LongType)
      .add("c02_level", LongType)
      .add("geo",
        new StructType()
          .add("lat", DoubleType)
          .add("long", DoubleType))))
import org.apache.spark.sql.functions.explode
val df = spark.read.schema(schema).json(dataDS.rdd)
val explodedDF = df.select($"dcname", explode($"source"))
case class DeviceAlert(dcname: String, deviceType:String, deviceId:Long, ip:String, temp:Long, c02_level: Long, lat: Double, lon: Double)
val notifydevicesDS = explodedDF.select( $"dcname" as "dcname",
$"key" as "deviceType",
'value.getItem("ip") as 'ip,
'value.getItem("id") as 'deviceId,
'value.getItem("c02_level") as 'c02_level,
'value.getItem("temp") as 'temp,
'value.getItem("geo").getItem("lat") as 'lat,
'value.getItem("geo").getItem("longi") as 'lon).as[DeviceAlert]
Array(
DeviceAlert(dc-101,sensor-igauge,68.28.91.22,10,35,1475,38.0,97.0),
DeviceAlert(dc-101,sensor-ipad,67.185.72.1,13,34,1370,47.41,-122.0),
DeviceAlert(dc-101,sensor-inest,208.109.163.218,8,40,1346,33.61,-111.89),
DeviceAlert(dc-101,sensor-istick,204.116.105.67,5,40,1574,35.93,-85.46))
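With the typed Dataset in hand, an alert is just a predicate on the case class; a small sketch (the 1400 threshold is arbitrary):
// flag devices whose CO2 reading is elevated
notifydevicesDS.filter(_.c02_level > 1400).map(a => s"${a.dcname}/${a.deviceType} at ${a.ip}: c02=${a.c02_level}").show(false)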