Streaming XML example

// Sample XML payload: a <people> root holding three <person> records, each with an
// <age> element that carries a `born` attribute.
val xml2 = """<people>  <person>    <age born="1990-02-24">25</age>  </person>  <person>    <age born="1985-01-01">30</age>  </person>  <person>    <age born="1980-01-01">30</age>  </person></people>"""

// Drop the payload into the directory that the readers below load from.
// NOTE(review): dbutils.fs.put does not overwrite by default, so re-running this cell is
// expected to fail once the file exists; pass `overwrite = true` if re-runs are intended.
dbutils.fs.put("/FileStore/tables/test/xml/data/age/ages4.xml", xml2)

// Second sample XML payload with a slightly different shape: the first <person> also has
// a <name> element, and the second <person> has an empty <age> element.
val xml3 = """<people>  <person>    <age born="1990-02-24">25</age>    <name>Hyukjin</name>  </person>  <person>    <age born="1985-01-01"></age>  </person>  <person>    <age born="1980-01-01">30</age>  </person></people>"""

// Drop the payload into the same directory as the first sample file.
// NOTE(review): dbutils.fs.put does not overwrite by default, so re-running this cell is
// expected to fail once the file exists; pass `overwrite = true` if re-runs are intended.
dbutils.fs.put("/FileStore/tables/test/xml/data/age/ages5.xml", xml3)

%fs head /FileStore/tables/test/xml/data/age/ages4.xml

import java.nio.charset.StandardCharsets

import com.databricks.spark.xml._
import com.databricks.spark.xml.functions.from_xml
import com.databricks.spark.xml.schema_of_xml
import org.apache.spark.sql.functions.{input_file_name, udf}
import spark.implicits._

// UDF that decodes a binary column value into a String using UTF-8.
val toStrUDF = udf((bytes: Array[Byte]) => new String(bytes, StandardCharsets.UTF_8))

// Batch-read every file under the data directory with the binaryFile source and expose
// each file's decoded content as a single `text` column.
val df_schema = spark.read
  .format("binaryFile")
  .load("/FileStore/tables/test/xml/data/age/")
  .select(toStrUDF($"content").alias("text"))

// Infer the XML schema from the decoded file contents.
// This is a costly operation when there are many files, because inference has to list and
// read all of them; prefer a user-defined schema in that case.
val payloadSchema = schema_of_xml(df_schema.select("text").as[String])

 // Streaming pipeline: pick up files from the data directory through the cloudFiles source
 // (binaryFile format, notifications disabled), decode each file's bytes to text, parse the
 // text with the previously inferred XML schema, and record the originating file path.
 val df = {
   val cloudFilesOptions = Map(
     "cloudFiles.useNotifications" -> "false",
     "cloudFiles.format" -> "binaryFile"
   )

   val rawFiles = spark.readStream
     .format("cloudFiles")
     .options(cloudFilesOptions)
     .load("/FileStore/tables/test/xml/data/age/")

   rawFiles
     .select(from_xml(toStrUDF($"content"), payloadSchema).alias("parsed"))
     .withColumn("path", input_file_name())
 }

// Render the streaming DataFrame `df` in the notebook output.
display(df)

Streaming XML example (Scala)