// Load a text file from HDFS as an RDD[String]; one element per line.
// NOTE: quotes must be plain ASCII " — curly quotes from a word processor will not compile.
scala> val data = sc.textFile("hdfs://path/to/file")
scala> data.foreach(println) // print all lines from file (output lands on the executors' stdout in a real cluster)
// Simple named printer; original had a mismatched paren: {println(a)) — fixed to a balanced block.
scala> def myPrint(a: String): Unit = { println(a) }
scala> data.foreach(a => myPrint(a)) // prints all lines from file using myPrint function
// One-column schema: each text line becomes a row with a single "row" field.
scala> case class EmailRow(row: String) // create class for row
scala> val df = data.map(x => EmailRow(x)).toDF() // create DataFrame from the RDD (needs spark.implicits._ in scope)
// show dataframe
scala> df.show()
scala> df.select("row").show() // select the single column by name
scala> df.foreach(println) // print every Row object (unformatted, unlike show())
// Create unique id column for dataset.
// monotonically_increasing_id is the supported snake_case function;
// the camelCase alias monotonicallyIncreasingId was deprecated in Spark 2.0 and later removed.
scala> import org.apache.spark.sql.functions.monotonically_increasing_id
scala> val newDf = df.withColumn("id", monotonically_increasing_id()) // adds a new column at the end of the current dataset
scala> val ds2 = newDf.select("id", "row") // now id is the first column
// Filter via ds2's own column reference (the original went through df, which only
// works because the column lineage is shared — referencing ds2 directly is safer).
scala> ds2.select("id", "row").where(ds2("row").contains("X-")).show() // filter out smth and show it
scala> ds2.count() // how many lines do I have in my dataset
// Classic word count over a CSV: split each line on commas, pair each token with 1,
// then sum per key. Result is an RDD[(String, Int)] — call .collect or an action to see it.
val text_file = sc.textFile("hdfs://bigdata21.webmedia.int:8020/user/margusja/titanic_test.csv")
//text_file.map(_.length).collect
//text_file.flatMap(_.split(",")).collect
// word (as a key), 1
text_file.flatMap(_.split(",")).map((_, 1)).reduceByKey(_ + _)
// Positional CSV parse: column 2 = name, column 5 = age (kept as String because the
// source contains empty fields; cast to int only after filtering them out).
case class Person(name: String, age: String)
val people = text_file.map(_.split(",")).map(p => Person(p(2), p(5))).toDS().toDF()
// Age is String and contains empty fields. Let's filter out non-numerical values.
// Cast explicitly before comparing so empty strings become null and are dropped,
// instead of relying on implicit string-to-number coercion.
people.filter(people("age").cast("int") > 0).select(people("age").cast("int")).show()
// let's take the average of people's age
people.filter(people("age").cast("int") > 0).select(avg(people("age").cast("int"))).show()