Apache Kafka and Spark project, showcasing Python and real-time data streaming skills.
Real-time analytics is becoming more and more important, especially when critical actions in scaled environments require quick intervention.
In this project, I want to demonstrate some skills in real-time data stream analysis and transformation.
Nowadays, every data professional should be comfortable with open-source solutions, especially those working at startups or on highly scalable projects.
The idea of this project is to show technical skills and the challenges one might face when using tools without a GUI (Graphical User Interface).
I'll be using randomly generated data for this project, so I won't link any datasets or Apache's official websites.
Let's begin!
Assume your company has multiple sensors on industrial equipment that requires critical attention, or deeper analysis for preventive maintenance.
Or assume that your business runs a highly scalable application that would benefit from real-time analysis of its database.
There are basically 5 very good reasons to use Apache Kafka for a project like this:
I'm going to include the commented code of the data generator below, but I won't execute it in this Jupyter Notebook. I'll run it with Python from the command line to save a file with the generated values and feed that information into Kafka/Spark.
Feel free to test it out! You can run it multiple times and you'll see that the output is random each time.
Here's how it works:
# IoT Sensor Data Simulator
# Run the simulator with the following command at the terminal:
# python simulator.py 10000 > ../data/sensor_data.txt
# Imports
import re
import sys
import copy
import random
import string
import datetime
from random import randrange
# Defines the number of messages generated.
# If no value is informed, generates 10 messages.
# IMPORTANT: THE COMMENTED SNIPPET BELOW USES 'sys.argv', WHICH WON'T WORK IN THIS NOTEBOOK,
# SO I'LL SET num_msgs TO 10 BY DEFAULT, JUST SO WE CAN CHECK ITS OUTPUT.
# if len(sys.argv) > 1:
#     num_msgs = int(sys.argv[1])
# else:
#     num_msgs = 10
num_msgs = 10
# Set sensor base temperature
dic_temp_sensors = {'sensor1': 38.3, 'sensor2': 45.3, 'sensor3': 31.8, 'sensor4': 73.1, 'sensor5': 71.8, 'sensor6': 63.7,
'sensor7': 80.7, 'sensor8': 52.0, 'sensor9': 64.1, 'sensor10': 62.7, 'sensor11': 73.4, 'sensor12': 54.2,
'sensor13': 76.4, 'sensor14': 49.0, 'sensor15': 50.4, 'sensor16': 58.8, 'sensor17': 47.6, 'sensor18': 55.4,
'sensor19': 58.8, 'sensor20': 49.4, 'sensor21': 59.9, 'sensor22': 45.1, 'sensor23': 55.1, 'sensor24': 16.6,
'sensor25': 42.8, 'sensor26': 50.4, 'sensor27': 32.9, 'sensor28': 71.8, 'sensor29': 33.5, 'sensor30': 71.7,
'sensor31': 37.8, 'sensor32': 69.6, 'sensor33': 50.3, 'sensor34': 84.4, 'sensor35': 79.0, 'sensor36': 11.0,
'sensor37': 64.2, 'sensor38': 57.9, 'sensor39': 60.7, 'sensor40': 58.6, 'sensor41': 64.5, 'sensor42': 31.2,
'sensor43': 54.4, 'sensor44': 40.1, 'sensor45': 44.3, 'sensor46': 62.7, 'sensor47': 53.4, 'sensor48': 52.4,
'sensor49': 45.6, 'sensor50': 58.4}
# Base ID for each sensor
id_base_sensor = "S-HAR-PORT-DATA-19951-"
# Base ID for each equipment
id_base_equipment = "E-HAR-PORT-DATA-25015-"
# Sensor standard readout
readout = "iot:reading:sensor:temp"
# Uppercase letters used to build the random ID suffixes
letters = string.ascii_uppercase
# String with readout standard
header_reading_iot = """\
{ "id_sensor": "%s",
"id_equipment": "%s",
"sensor": "%s", """
iotmsg_date_event = """\
"date_event": "%sZ", """
iotmsg_format = """\
"standard": {"format": "%s", """
iotmsg_data = """\
"reading": { "temperature": %.1f }
}
}"""
# Sensor ID mapping dictionary
dic_map_sensor_id = {}
# Latest measurement dictionary
dic_current_temp = {}
# Generates the JSON output
if __name__ == "__main__":
    # Loop from 0 up to the number of messages defined when running the simulator
    for counter in range(0, num_msgs):
        # Generates 3 random digits
        rand_num = str(random.randrange(0, 9)) + str(random.randrange(0, 9)) + str(random.randrange(0, 9))
        # Generates 2 random letters
        rand_letter = random.choice(letters) + random.choice(letters)
        # Generates a random temperature offset with a uniform distribution
        rand_temp_value = random.uniform(-5, 5)
        # Generates another random value following a uniform distribution (per-reading drift)
        rand_temp_value_delta = random.uniform(-1, 1)
        # Sensor ID gets the base value plus the generated values
        id_sensor = id_base_sensor + rand_num + rand_letter
        # Equipment ID gets the base value plus the generated values
        id_equipment = id_base_equipment + rand_num + rand_letter
        # Selects a random sensor from the dictionary
        sensor = random.choice(list(dic_temp_sensors.keys()))
        # If the sensor ID has no mapping yet, creates the association
        if not id_sensor in dic_map_sensor_id:
            # Includes the sensor in the mapping
            dic_map_sensor_id[id_sensor] = sensor
            # Includes the temperature in the current-readings dictionary
            dic_current_temp[id_sensor] = dic_temp_sensors[sensor] + rand_temp_value
        # If the ID is already mapped to a different sensor, keeps the original mapping
        elif not dic_map_sensor_id[id_sensor] == sensor:
            sensor = dic_map_sensor_id[id_sensor]
        # Extra temperature adjustment so consecutive readings drift randomly
        temperature = dic_current_temp[id_sensor] + rand_temp_value_delta
        dic_current_temp[id_sensor] = temperature
        # Writes the current time as the event timestamp
        today = datetime.datetime.today()
        date_event = today.isoformat()
        # Prints the result in JSON format (whitespace stripped from the templates)
        print(re.sub(r"[\s+]", "", header_reading_iot) % (id_sensor, id_equipment, sensor),
              re.sub(r"[\s+]", "", iotmsg_date_event) % (date_event),
              re.sub(r"[\s+]", "", iotmsg_format) % (readout),
              re.sub(r"[\s+]", "", iotmsg_data) % (temperature))
I want to emphasize a CONCEPT here. Why would we want real-time analysis in the first place?
Here are some compelling reasons why one would want to implement that:
1. Real-time Decision Making: Real-time data analysis allows businesses to make critical decisions immediately as data streams in. This is especially valuable in situations where timely actions can lead to a competitive advantage. For example, in finance, making real-time investment decisions can be crucial for maximizing returns.
2. Proactive Issue Detection: Real-time analysis enables early detection of issues and anomalies. By continuously monitoring data streams, businesses can identify potential problems before they escalate, thus minimizing the impact and reducing downtime.
3. Personalization and User Experience: In industries like e-commerce and online advertising, real-time data analysis can be used to personalize user experiences. Analyzing user behavior in real-time allows businesses to recommend relevant products, services, or content instantly, leading to improved customer satisfaction and engagement.
4. Fraud Detection and Security: Real-time data analysis is vital for identifying fraudulent activities as they happen. For example, financial institutions can use real-time analysis to detect suspicious transactions and prevent fraud in real-time, providing an additional layer of security.
5. IoT and Sensor Data: In IoT (Internet of Things) environments, numerous sensors generate continuous streams of data. Real-time data analysis enables the monitoring and analysis of sensor data as it is produced, enabling responsive actions based on the data.
6. Stream Processing Efficiency: Apache Kafka, as a distributed streaming platform, allows for high-throughput, low-latency stream processing. Real-time data analysis with PySpark on Kafka streams can efficiently handle large volumes of data and deliver results in near real-time.
7. Operational Efficiency: Real-time data analysis helps organizations optimize their operations by identifying inefficiencies, bottlenecks, and opportunities for improvement in real-time. This allows for rapid adjustments and optimizations to achieve better performance.
8. Event-Driven Architecture: Real-time data analysis is a fundamental component of event-driven architectures, where actions are triggered in response to specific events or data patterns. This architecture is scalable, flexible, and can lead to a more responsive system.
9. Market Insights and Trends: For businesses in competitive markets, real-time data analysis provides up-to-the-minute insights into market trends, customer preferences, and competitor behavior. This information is invaluable for staying ahead in the market.
10. Real-time Reporting and Visualization: Real-time data analysis allows for dynamic and interactive reporting and visualization. Stakeholders can access real-time dashboards and reports, facilitating quick understanding and decision-making.
In summary, real-time data analysis is essential for making informed, timely decisions, detecting issues proactively, optimizing operations, enhancing user experiences, and gaining a competitive advantage in today's fast-paced, data-driven world. Python, PySpark, and Apache Kafka together provide a powerful stack to implement real-time data analysis solutions efficiently.
With that in mind, let's get down to business.
Kafka can run on dedicated servers and on clusters, but for this example I'll run everything locally. Either way, we need to initialize ZooKeeper, which acts as our Kafka cluster coordinator.
Kafka runs from the command line; there's no user interface. For that, we will open 3 different terminal windows. In total, I'll have 4 terminal windows open, because I'll also run a Jupyter Notebook.
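For reference, the typical sequence on a default local Kafka install looks like this (the exact commands below are my assumption, since I ran them outside this notebook, from the Kafka installation folder):
1st Terminal (ZooKeeper): bin/zookeeper-server-start.sh config/zookeeper.properties
2nd Terminal (Kafka broker): bin/kafka-server-start.sh config/server.properties
3rd Terminal (create the topic): bin/kafka-topics.sh --create --topic helioport --bootstrap-server localhost:9092 --partitions 1 --replication-factor 1
3rd Terminal (describe the topic): bin/kafka-topics.sh --describe --topic helioport --bootstrap-server localhost:9092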
Now that we have started our services and created the topic, it's time to initialize the data stream. For that, let's use the last terminal, the one where we created and described the topic.
IMPORTANT CONCEPT: Kafka behaves as a sort of repository for data streams. Kafka by itself won't download or upload data; we need a producer to make our data generator send information to Kafka, and a consumer to subscribe to the information available in Kafka.
Let's open the console producer, which is our data producer.
The command that feeds the file written by the Python data generator into the topic is: bin/kafka-console-producer.sh --bootstrap-server localhost:9092 --topic helioport < ../data/sensor_data.txt
Once you run this command you will realize that nothing really happens on your screen, but bear with me. Let's check if our streaming is functional. Now that I have run the console producer, let's run the console consumer, still on the very same terminal.
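For reference, the console consumer command would look like this (again an assumption of mine, based on the standard Kafka CLI; it reads the topic from the beginning and prints every message to the screen):
bin/kafka-console-consumer.sh --bootstrap-server localhost:9092 --topic helioport --from-beginning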
# Check the Python version:
from platform import python_version
print('Python version:', python_version())
Python version: 3.9.13
# If not installed yet, install findspark.
# !pip install findspark
# Imports findspark and initializes it.
import findspark
findspark.init()
# Import required modules.
# NOTE: we only need the streaming piece to get the data in; notice how many more SQL libraries we use.
# In the end, all the data we retrieve is there to solve a business problem through data analysis.
import pyspark
from pyspark.streaming import StreamingContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType
from pyspark.sql.functions import col, from_json
# Kafka connector package for Spark, passed as spark-submit arguments
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0 pyspark-shell'
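A side note: as a rule of thumb, the spark-sql-kafka package version should match the Spark/PySpark version you are running. Here I'm on PySpark 3.4.0 with the 3.3.0 connector, which still resolved fine, but a mismatch is a common source of dependency errors, so org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.0 would be the safer choice.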
# Spark versions used in this project
%reload_ext watermark
%watermark -a "Helio Ribeiro" --iversions
Author: Helio Ribeiro

findspark: 2.0.1
pyspark  : 3.4.0
# Create Spark session
spark = SparkSession.builder.appName("RealTimeProject").getOrCreate()
23/07/22 19:00:48 WARN Utils: Your hostname, Helios-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 172.20.10.12 instead (on interface en0) 23/07/22 19:00:48 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
:: loading settings :: url = jar:file:/opt/anaconda3/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /Users/helioribeiro/.ivy2/cache The jars for the packages stored in: /Users/helioribeiro/.ivy2/jars org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency :: resolving dependencies :: org.apache.spark#spark-submit-parent-d35725a3-b00a-4c9f-8a25-bfe19276b5d6;1.0 confs: [default] found org.apache.spark#spark-sql-kafka-0-10_2.12;3.3.0 in central found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.3.0 in central found org.apache.kafka#kafka-clients;2.8.1 in central found org.lz4#lz4-java;1.8.0 in central found org.xerial.snappy#snappy-java;1.1.8.4 in central found org.slf4j#slf4j-api;1.7.32 in central found org.apache.hadoop#hadoop-client-runtime;3.3.2 in central found org.spark-project.spark#unused;1.0.0 in central found org.apache.hadoop#hadoop-client-api;3.3.2 in central found commons-logging#commons-logging;1.1.3 in central found com.google.code.findbugs#jsr305;3.0.0 in central found org.apache.commons#commons-pool2;2.11.1 in central :: resolution report :: resolve 241ms :: artifacts dl 13ms :: modules in use: com.google.code.findbugs#jsr305;3.0.0 from central in [default] commons-logging#commons-logging;1.1.3 from central in [default] org.apache.commons#commons-pool2;2.11.1 from central in [default] org.apache.hadoop#hadoop-client-api;3.3.2 from central in [default] org.apache.hadoop#hadoop-client-runtime;3.3.2 from central in [default] org.apache.kafka#kafka-clients;2.8.1 from central in [default] org.apache.spark#spark-sql-kafka-0-10_2.12;3.3.0 from central in [default] org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.3.0 from central in [default] org.lz4#lz4-java;1.8.0 from central in [default] org.slf4j#slf4j-api;1.7.32 from central in [default] org.spark-project.spark#unused;1.0.0 from central in [default] org.xerial.snappy#snappy-java;1.1.8.4 from central in [default] --------------------------------------------------------------------- | | modules || artifacts | | conf | number| search|dwnlded|evicted|| number|dwnlded| --------------------------------------------------------------------- | default | 12 | 0 | 0 | 0 || 12 | 0 | --------------------------------------------------------------------- :: retrieving :: org.apache.spark#spark-submit-parent-d35725a3-b00a-4c9f-8a25-bfe19276b5d6 confs: [default] 0 artifacts copied, 12 already retrieved (0kB/5ms) 23/07/22 19:00:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable Setting default log level to "WARN". To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
# Create the topic subscription for the data we want to pull from Kafka.
df = spark \
.readStream \
.format("kafka") \
.option("kafka.bootstrap.servers", "localhost:9092") \
.option("subscribe", "helioport") \
.load()
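By default the Kafka source starts reading from the latest offsets, so only messages produced after the query starts will show up. If you want to replay everything already sitting in the topic, a small variation (my assumption, not what I ran above) is to add the startingOffsets option:
# Variation (assumption): replay the topic from the beginning instead of only consuming new messages.
df = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("subscribe", "helioport") \
    .option("startingOffsets", "earliest") \
    .load()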
# Defining the data schema we desire to capture for analysis (temperature).
temp_data_schema = StructType([StructField("reading",
StructType([StructField("temperature", DoubleType(), True)]), True)])
# Defining global data schema for this streaming
data_schema = StructType([
StructField("id_sensor", StringType(), True),
StructField("id_equipment", StringType(), True),
StructField("sensor", StringType(), True),
StructField("date_event", StringType(), True),
StructField("standard", temp_data_schema, True)
])
# Cast each Kafka record value to a string
df_convert = df.selectExpr("CAST(value AS STRING)")
# Parse the JSON and expand it into dataframe columns.
df_convert = df_convert.withColumn("jsonData", from_json(col("value"), data_schema)).select("jsonData.*")
df_convert.printSchema()
root
 |-- id_sensor: string (nullable = true)
 |-- id_equipment: string (nullable = true)
 |-- sensor: string (nullable = true)
 |-- date_event: string (nullable = true)
 |-- standard: struct (nullable = true)
 |    |-- reading: struct (nullable = true)
 |    |    |-- temperature: double (nullable = true)
# Renaming columns for simplified analysis
df_convert_temp_sensor = df_convert.select(col("standard.reading.temperature").alias("temperature"),
col("sensor"))
df_convert_temp_sensor.printSchema()
root
 |-- temperature: double (nullable = true)
 |-- sensor: string (nullable = true)
# We can't inspect this dataframe with .head() because it comes from a streaming source
# (Spark would raise: 'Queries with streaming sources must be executed with writeStream.start()').
# This is the object that will hold the analysis, with the average temperature per sensor.
df_avg_temp_sensor = df_convert_temp_sensor.groupby("sensor").mean("temperature")
df_avg_temp_sensor.printSchema()
root
 |-- sensor: string (nullable = true)
 |-- avg(temperature): double (nullable = true)
# Renaming columns for simplified analysis
df_avg_temp_sensor = df_avg_temp_sensor.select(col("sensor").alias("sensor"),
col("avg(temperature)").alias("avg_temp"))
df_avg_temp_sensor.printSchema()
root
 |-- sensor: string (nullable = true)
 |-- avg_temp: double (nullable = true)
Are you ready? This part is really important, since it's where we are going to visualize our data. There are different ways of running the stream, with different timeouts, including a way to automate stopping the stream.
Kafka shines here because you can run multiple streams simultaneously, which makes it a great tool for real-time data streaming.
Remember the 4 terminals I opened when we started?
Now we need to open a fifth one that will run our Python data generator to feed our Kafka stream!
Here's what it looks like now:
Here are the commands we need to run to visualize our stream, in this order:
5th Terminal: python simulator.py 1000 > ../data/sensor_data.txt
4th Terminal: bin/kafka-console-producer.sh --bootstrap-server localhost:9092 --topic helioport < ../data/sensor_data.txt
What we are doing here is providing Kafka with a new data stream. When we run the Spark commands that follow, they will show a blank batch at first, but the process keeps running, so it's very important that you keep ingesting data.
Below is my execution, for comparison.
# Object that initiates the streaming with console output format.
# This is the moment where Spark establishes the connection with Kafka to retrieve data.
query = df_avg_temp_sensor.writeStream.outputMode("complete").format("console").start()
23/07/22 19:00:57 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /private/var/folders/vy/8tkfcqt12f31vsh13f_sb00h0000gn/T/temporary-a55c30d0-e95b-47ea-916d-6a33d05a0d8b. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort. 23/07/22 19:00:57 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled. 23/07/22 19:00:57 WARN AdminClientConfig: The configuration 'key.deserializer' was supplied but isn't a known config. 23/07/22 19:00:57 WARN AdminClientConfig: The configuration 'value.deserializer' was supplied but isn't a known config. 23/07/22 19:00:57 WARN AdminClientConfig: The configuration 'enable.auto.commit' was supplied but isn't a known config. 23/07/22 19:00:57 WARN AdminClientConfig: The configuration 'max.poll.records' was supplied but isn't a known config. 23/07/22 19:00:57 WARN AdminClientConfig: The configuration 'auto.offset.reset' was supplied but isn't a known config.
------------------------------------------- Batch: 0 ------------------------------------------- +------+--------+ |sensor|avg_temp| +------+--------+ +------+--------+
------------------------------------------- Batch: 1 ------------------------------------------- +--------+------------------+ | sensor| avg_temp| +--------+------------------+ | sensor7| 82.10476190476192| |sensor34| 85.08421052631579| |sensor41| 65.2375| |sensor50| 60.07500000000001| |sensor38| 57.1578947368421| |sensor31| 37.39411764705883| | sensor1| 39.25454545454546| |sensor30| 70.92307692307692| |sensor10| 61.41111111111112| |sensor25| 43.15833333333333| | sensor4| 73.3375| | sensor5| 72.50555555555556| |sensor20| 48.91428571428572| |sensor44| 40.8375| |sensor19|58.720000000000006| | sensor8|51.278571428571425| |sensor14| 49.12857142857142| |sensor24|16.544999999999995| |sensor43| 54.65| |sensor47|53.775000000000006| +--------+------------------+ only showing top 20 rows
------------------------------------------- Batch: 2 ------------------------------------------- +--------+------------------+ | sensor| avg_temp| +--------+------------------+ | sensor7| 82.1909090909091| |sensor34| 85.20869565217392| |sensor41| 64.63684210526316| |sensor50|59.828571428571436| |sensor31|37.790000000000006| |sensor38| 56.96190476190476| | sensor1| 38.90833333333334| |sensor30| 70.63333333333334| |sensor10| 61.89000000000001| |sensor25| 43.05384615384615| | sensor4| 73.39000000000001| | sensor5| 72.39166666666667| |sensor20| 49.06000000000001| |sensor44| 40.75714285714285| |sensor19| 58.81304347826087| | sensor8|51.231249999999996| |sensor14| 49.06086956521739| |sensor24| 16.18181818181818| |sensor43| 55.33076923076923| |sensor47| 53.67857142857144| +--------+------------------+ only showing top 20 rows
------------------------------------------- Batch: 3 ------------------------------------------- +--------+------------------+ | sensor| avg_temp| +--------+------------------+ | sensor7| 82.21739130434784| |sensor34| 84.916| |sensor41| 64.63684210526316| |sensor50| 60.22500000000001| |sensor31|37.790000000000006| |sensor38|56.979166666666664| | sensor1| 38.85| |sensor30| 70.875| |sensor10|61.775000000000006| |sensor25| 43.0875| | sensor4| 73.3904761904762| | sensor5| 72.55555555555556| |sensor20| 48.68000000000001| |sensor44| 40.47727272727273| |sensor19| 58.87692307692308| | sensor8| 51.07058823529411| |sensor14|48.912499999999994| |sensor24| 16.03478260869565| |sensor43| 55.05555555555556| |sensor47|54.022222222222226| +--------+------------------+ only showing top 20 rows
------------------------------------------- Batch: 4 ------------------------------------------- +--------+------------------+ | sensor| avg_temp| +--------+------------------+ | sensor7| 81.885| |sensor34| 84.19565217391305| |sensor41| 64.33846153846153| |sensor50| 59.61785714285715| |sensor31| 37.92580645161291| |sensor38|57.583720930232566| | sensor1| 39.14| |sensor30| 70.65454545454546| |sensor10| 61.87419354838711| |sensor25| 43.85625| | sensor4| 73.54324324324325| | sensor5| 72.95897435897436| |sensor20| 49.13023255813953| |sensor44| 40.63333333333334| |sensor19|58.873684210526314| | sensor8|52.411627906976726| |sensor14|48.894594594594594| |sensor24|16.634210526315787| |sensor43| 54.45277777777778| |sensor47| 53.5| +--------+------------------+ only showing top 20 rows
That's it, we basically have our data already. We can also run the streaming query in other ways, for instance with awaitTermination, which keeps it running until we explicitly stop the process.
There are other things we can do as well, like checking the number of active streams, the information about each of them, the status of the process, and more.
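As a minimal sketch (my addition here, not something I executed above), one way to automate the stop is to wait on the query with a timeout and stop it if it's still running:
# Minimal sketch (assumption): let the query run for up to 60 seconds, then stop it.
finished = query.awaitTermination(60)   # blocks up to 60 seconds; returns True if the query terminated
if not finished:
    query.stop()                        # stop it ourselves if it is still active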
# Execute the streaming query and keep the process from being terminated.
#query.awaitTermination()
query.status
{'message': 'Waiting for data to arrive', 'isDataAvailable': False, 'isTriggerActive': False}
query.lastProgress
{'id': 'a7db1898-818b-4a09-a775-e1501ef07d72', 'runId': '30e5f81a-950a-4ddd-99c0-d3a331bc84b5', 'name': None, 'timestamp': '2023-07-22T17:01:46.927Z', 'batchId': 5, 'numInputRows': 0, 'inputRowsPerSecond': 0.0, 'processedRowsPerSecond': 0.0, 'durationMs': {'latestOffset': 2, 'triggerExecution': 2}, 'stateOperators': [{'operatorName': 'stateStoreSave', 'numRowsTotal': 50, 'numRowsUpdated': 0, 'allUpdatesTimeMs': 677, 'numRowsRemoved': 0, 'allRemovalsTimeMs': 0, 'commitTimeMs': 11963, 'memoryUsedBytes': 101936, 'numRowsDroppedByWatermark': 0, 'numShufflePartitions': 200, 'numStateStoreInstances': 200, 'customMetrics': {'loadedMapCacheHitCount': 1600, 'loadedMapCacheMissCount': 0, 'stateOnCurrentVersionSizeBytes': 30568}}], 'sources': [{'description': 'KafkaV2[Subscribe[helioport]]', 'startOffset': {'helioport': {'0': 10000}}, 'endOffset': {'helioport': {'0': 10000}}, 'latestOffset': {'helioport': {'0': 10000}}, 'numInputRows': 0, 'inputRowsPerSecond': 0.0, 'processedRowsPerSecond': 0.0, 'metrics': {'avgOffsetsBehindLatest': '0.0', 'maxOffsetsBehindLatest': '0', 'minOffsetsBehindLatest': '0'}}], 'sink': {'description': 'org.apache.spark.sql.execution.streaming.ConsoleTable$@1192a010', 'numOutputRows': 0}}
query.explain()
== Physical Plan == WriteToDataSourceV2 MicroBatchWrite[epoch: 4, writer: ConsoleWriter[numRows=20, truncate=true]], org.apache.spark.sql.execution.datasources.v2.DataSourceV2Strategy$$Lambda$2479/0x00000008011cc040@521476fa +- *(4) HashAggregate(keys=[sensor#28], functions=[avg(temperature#36)]) +- StateStoreSave [sensor#28], state info [ checkpoint = file:/private/var/folders/vy/8tkfcqt12f31vsh13f_sb00h0000gn/T/temporary-a55c30d0-e95b-47ea-916d-6a33d05a0d8b/state, runId = 30e5f81a-950a-4ddd-99c0-d3a331bc84b5, opId = 0, ver = 4, numPartitions = 200], Complete, 0, 0, 2 +- *(3) HashAggregate(keys=[sensor#28], functions=[merge_avg(temperature#36)]) +- StateStoreRestore [sensor#28], state info [ checkpoint = file:/private/var/folders/vy/8tkfcqt12f31vsh13f_sb00h0000gn/T/temporary-a55c30d0-e95b-47ea-916d-6a33d05a0d8b/state, runId = 30e5f81a-950a-4ddd-99c0-d3a331bc84b5, opId = 0, ver = 4, numPartitions = 200], 2 +- *(2) HashAggregate(keys=[sensor#28], functions=[merge_avg(temperature#36)]) +- Exchange hashpartitioning(sensor#28, 200), ENSURE_REQUIREMENTS, [plan_id=1016] +- *(1) HashAggregate(keys=[sensor#28], functions=[partial_avg(temperature#36)]) +- *(1) Project [jsonData#23.standard.reading.temperature AS temperature#36, jsonData#23.sensor AS sensor#28] +- Project [from_json(StructField(id_sensor,StringType,true), StructField(id_equipment,StringType,true), StructField(sensor,StringType,true), StructField(date_event,StringType,true), StructField(standard,StructType(StructField(reading,StructType(StructField(temperature,DoubleType,true)),true)),true), cast(value#8 as string), Some(Europe/Madrid)) AS jsonData#23] +- MicroBatchScan[key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13] class org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaScan
# Object that initiates streaming with memory format (temporary table)
query_memory = df_avg_temp_sensor \
.writeStream \
.queryName("Helio_Kafka_Project") \
.outputMode("complete") \
.format("memory") \
.start()
23/07/22 19:01:55 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /private/var/folders/vy/8tkfcqt12f31vsh13f_sb00h0000gn/T/temporary-7ba0a812-4f0a-44aa-8040-e1f6479d33b4. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort. 23/07/22 19:01:55 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled. 23/07/22 19:01:55 WARN AdminClientConfig: The configuration 'key.deserializer' was supplied but isn't a known config. 23/07/22 19:01:55 WARN AdminClientConfig: The configuration 'value.deserializer' was supplied but isn't a known config. 23/07/22 19:01:55 WARN AdminClientConfig: The configuration 'enable.auto.commit' was supplied but isn't a known config. 23/07/22 19:01:55 WARN AdminClientConfig: The configuration 'max.poll.records' was supplied but isn't a known config. 23/07/22 19:01:55 WARN AdminClientConfig: The configuration 'auto.offset.reset' was supplied but isn't a known config.
Here we can check that multiple data streams can run simultaneously, and that we can apply conditions to the code execution.
I'll run one last instance of the query inside a for loop and then terminate it, so I can share the results.
# Active streams
spark.streams.active
[<pyspark.sql.streaming.query.StreamingQuery at 0x10bb06970>, <pyspark.sql.streaming.query.StreamingQuery at 0x10bce5d30>]
# Maintain query execution and apply SQL to the data in real-time.
from time import sleep
for x in range(10):
    spark.sql("select sensor, round(avg_temp, 2) as avg from Helio_Kafka_Project where avg_temp > 65").show()
    sleep(3)
query_memory.stop()
+------+---+ |sensor|avg| +------+---+ +------+---+ +------+---+ |sensor|avg| +------+---+ +------+---+ +------+---+ |sensor|avg| +------+---+ +------+---+
+------+---+ |sensor|avg| +------+---+ +------+---+
------------------------------------------- Batch: 5 ------------------------------------------- +--------+------------------+ | sensor| avg_temp| +--------+------------------+ | sensor7| 81.86| |sensor34| 84.04313725490196| |sensor41| 64.38478260869564| |sensor50|59.612500000000004| |sensor31| 38.07714285714286| |sensor38|57.692592592592604| | sensor1|39.135714285714286| |sensor30| 70.72| |sensor10| 61.71621621621623| |sensor25|43.973684210526315| | sensor4| 73.64500000000001| | sensor5| 73.075| |sensor20| 48.86666666666666| |sensor44| 40.31458333333334| |sensor19| 58.77619047619047| | sensor8| 52.35882352941175| |sensor14| 48.9075| |sensor24|16.583720930232555| |sensor43|54.311363636363645| |sensor47| 53.23265306122449| +--------+------------------+ only showing top 20 rows
------------------------------------------- Batch: 6 ------------------------------------------- +--------+------------------+ | sensor| avg_temp| +--------+------------------+ | sensor7| 81.76896551724138| |sensor34| 83.85797101449275| |sensor41| 64.24237288135593| |sensor50| 59.54761904761905| |sensor31| 37.99047619047619| |sensor38| 57.78461538461539| | sensor1| 39.18793103448276| |sensor30| 70.66078431372549| |sensor10| 61.87115384615385| |sensor25| 44.06078431372549| | sensor4| 73.60000000000001| | sensor5| 73.21111111111111| |sensor20|49.145070422535206| |sensor44|40.592063492063495| |sensor19| 58.9| | sensor8|52.681428571428555| |sensor14|48.819607843137256| |sensor24|16.818518518518516| |sensor43| 54.25932203389831| |sensor47|53.456896551724135| +--------+------------------+ only showing top 20 rows
+--------+-----+ | sensor| avg| +--------+-----+ | sensor7|81.66| |sensor34|82.64| |sensor30| 71.8| | sensor4| 74.9| | sensor5|73.98| |sensor28|69.98| |sensor11|73.09| |sensor35|79.14| |sensor13| 75.8| |sensor32|70.62| +--------+-----+ +--------+-----+ | sensor| avg| +--------+-----+ | sensor7|81.51| |sensor34|83.18| |sensor30|70.67| | sensor4|73.72| | sensor5|73.87| |sensor28|71.35| |sensor11| 74.2| |sensor35|80.08| |sensor13|76.27| |sensor32|70.35| +--------+-----+
------------------------------------------- Batch: 7 ------------------------------------------- +--------+------------------+ | sensor| avg_temp| +--------+------------------+ | sensor7| 81.81525423728814| |sensor34| 83.98133333333332| |sensor41| 64.28524590163934| |sensor50|59.527906976744184| |sensor31| 37.91162790697674| |sensor38| 57.87205882352942| | sensor1| 39.12203389830509| |sensor30| 70.69056603773585| |sensor10| 61.91320754716981| |sensor25|44.005357142857136| | sensor4| 73.54| | sensor5| 73.02833333333334| |sensor20| 49.24383561643835| |sensor44| 40.65| |sensor19| 58.89491525423728| | sensor8|52.558108108108094| |sensor14|48.971698113207545| |sensor24|16.707142857142856| |sensor43| 54.11290322580645| |sensor47| 53.53220338983051| +--------+------------------+ only showing top 20 rows
+--------+-----+ | sensor| avg| +--------+-----+ | sensor7|81.51| |sensor34|83.18| |sensor30|70.67| | sensor4|73.72| | sensor5|73.87| |sensor28|71.35| |sensor11| 74.2| |sensor35|80.08| |sensor13|76.27| |sensor32|70.35| +--------+-----+
------------------------------------------- Batch: 8 ------------------------------------------- +--------+------------------+ | sensor| avg_temp| +--------+------------------+ | sensor7| 81.7| |sensor34| 84.00631578947367| |sensor41| 64.58055555555555| |sensor50| 59.1157894736842| |sensor31| 37.41290322580645| |sensor38|57.888043478260876| | sensor1| 39.05733333333333| |sensor30| 70.91911764705883| |sensor10| 62.03030303030303| |sensor25| 43.66374999999999| | sensor4| 73.2231884057971| | sensor5| 72.77922077922078| |sensor20|49.238144329896905| |sensor44|40.709638554216866| |sensor19| 58.77105263157894| | sensor8| 52.27362637362636| |sensor14| 49.08| |sensor24| 16.55| |sensor43| 54.10588235294118| |sensor47| 53.652| +--------+------------------+ only showing top 20 rows
+--------+-----+ | sensor| avg| +--------+-----+ | sensor7|81.67| |sensor34|83.64| |sensor30|70.75| | sensor4|73.53| | sensor5|73.16| |sensor28|71.74| |sensor11|74.05| |sensor35|80.02| |sensor13|76.48| |sensor32|70.35| +--------+-----+ +--------+-----+ | sensor| avg| +--------+-----+ | sensor7|81.48| |sensor34|83.83| |sensor30|71.17| | sensor4|72.85| | sensor5|72.59| |sensor28|72.01| |sensor11|74.13| |sensor35|79.54| |sensor13|76.34| |sensor32|70.12| +--------+-----+
------------------------------------------- Batch: 9 ------------------------------------------- +--------+------------------+ | sensor| avg_temp| +--------+------------------+ | sensor7| 81.72800000000001| |sensor34| 84.08865979381443| |sensor41| 64.55405405405403| |sensor50| 59.13275862068964| |sensor31| 37.36190476190476| |sensor38|57.990425531914894| | sensor1| 39.05733333333333| |sensor30| 70.89857142857143| |sensor10|62.117391304347834| |sensor25|43.672839506172835| | sensor4| 73.15714285714287| | sensor5| 72.77922077922078| |sensor20| 49.26938775510204| |sensor44|40.709638554216866| |sensor19| 58.8038961038961| | sensor8|52.402105263157885| |sensor14| 49.03380281690141| |sensor24| 16.60281690140845| |sensor43|54.104651162790695| |sensor47| 53.652| +--------+------------------+ only showing top 20 rows
+--------+-----+ | sensor| avg| +--------+-----+ | sensor7|81.48| |sensor34|83.83| |sensor30|71.17| | sensor4|72.85| | sensor5|72.59| |sensor28|72.01| |sensor11|74.13| |sensor35|79.54| |sensor13|76.34| |sensor32|70.12| +--------+-----+
------------------------------------------- Batch: 10 ------------------------------------------- +--------+------------------+ | sensor| avg_temp| +--------+------------------+ | sensor7| 81.65111111111112| |sensor34| 84.13644859813083| |sensor41| 64.3136842105263| |sensor50|59.231325301204805| |sensor31| 37.52222222222222| |sensor38|58.124369747899166| | sensor1| 38.84563106796117| |sensor30| 71.31627906976745| |sensor10| 62.10941176470588| |sensor25| 43.28532110091743| | sensor4| 73.00348837209303| | sensor5| 72.66736842105263| |sensor20|49.244915254237284| |sensor44| 40.82065217391305| |sensor19| 58.72111111111111| | sensor8| 52.08715596330274| |sensor14| 49.1875| |sensor24|16.618181818181817| |sensor43| 54.2950495049505| |sensor47| 53.54456521739131| +--------+------------------+ only showing top 20 rows
23/07/22 19:02:46 ERROR WriteToDataSourceV2Exec: Data source write support MicroBatchWrite[epoch: 6, writer: org.apache.spark.sql.execution.streaming.sources.MemoryStreamingWrite@3717a05e] is aborting. 23/07/22 19:02:46 ERROR WriteToDataSourceV2Exec: Data source write support MicroBatchWrite[epoch: 6, writer: org.apache.spark.sql.execution.streaming.sources.MemoryStreamingWrite@3717a05e] aborted. 23/07/22 19:02:46 WARN Shell: Interrupted while joining on: Thread[Thread-36449,5,main] java.lang.InterruptedException at java.base/java.lang.Object.wait(Native Method) at java.base/java.lang.Thread.join(Thread.java:1305) at java.base/java.lang.Thread.join(Thread.java:1379) at org.apache.hadoop.util.Shell.joinThread(Shell.java:1042) at org.apache.hadoop.util.Shell.runCommand(Shell.java:1002) at org.apache.hadoop.util.Shell.run(Shell.java:900) at org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:1212) at org.apache.hadoop.util.Shell.execCommand(Shell.java:1306) at org.apache.hadoop.util.Shell.execCommand(Shell.java:1288) at org.apache.hadoop.fs.FileUtil.readLink(FileUtil.java:212) at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileLinkStatusInternal(RawLocalFileSystem.java:1113) at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatusInternal(RawLocalFileSystem.java:1102) at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatus(RawLocalFileSystem.java:1073) at org.apache.hadoop.fs.FileSystem.rename(FileSystem.java:1574) at org.apache.hadoop.fs.DelegateToFileSystem.renameInternal(DelegateToFileSystem.java:206) at org.apache.hadoop.fs.AbstractFileSystem.renameInternal(AbstractFileSystem.java:790) at org.apache.hadoop.fs.AbstractFileSystem.rename(AbstractFileSystem.java:720) at org.apache.hadoop.fs.ChecksumFs.renameInternal(ChecksumFs.java:489) at org.apache.hadoop.fs.AbstractFileSystem.rename(AbstractFileSystem.java:720) at org.apache.hadoop.fs.FileContext.rename(FileContext.java:1036) at org.apache.spark.sql.execution.streaming.FileContextBasedCheckpointFileManager.renameTempFile(CheckpointFileManager.scala:377) at org.apache.spark.sql.execution.streaming.CheckpointFileManager$RenameBasedFSDataOutputStream.close(CheckpointFileManager.scala:154) at net.jpountz.lz4.LZ4BlockOutputStream.close(LZ4BlockOutputStream.java:198) at java.base/java.io.FilterOutputStream.close(FilterOutputStream.java:188) at org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider.finalizeDeltaFile(HDFSBackedStateStoreProvider.scala:450) at org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider.org$apache$spark$sql$execution$streaming$state$HDFSBackedStateStoreProvider$$commitUpdates(HDFSBackedStateStoreProvider.scala:320) at org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider$HDFSBackedStateStore.commit(HDFSBackedStateStoreProvider.scala:141) at org.apache.spark.sql.execution.streaming.state.StreamingAggregationStateManagerBaseImpl.commit(StreamingAggregationStateManager.scala:89) at org.apache.spark.sql.execution.streaming.StateStoreSaveExec.$anonfun$doExecute$7(statefulOperators.scala:436) at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23) at org.apache.spark.util.Utils$.timeTakenMs(Utils.scala:640) at org.apache.spark.sql.execution.streaming.StateStoreWriter.timeTakenMs(statefulOperators.scala:143) at org.apache.spark.sql.execution.streaming.StateStoreWriter.timeTakenMs$(statefulOperators.scala:143) at org.apache.spark.sql.execution.streaming.StateStoreSaveExec.timeTakenMs(statefulOperators.scala:392) 
at org.apache.spark.sql.execution.streaming.StateStoreSaveExec.$anonfun$doExecute$5(statefulOperators.scala:436) at org.apache.spark.sql.execution.streaming.state.package$StateStoreOps.$anonfun$mapPartitionsWithStateStore$1(package.scala:68) at org.apache.spark.sql.execution.streaming.state.StateStoreRDD.compute(StateStoreRDD.scala:127) at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364) at org.apache.spark.rdd.RDD.iterator(RDD.scala:328) at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364) at org.apache.spark.rdd.RDD.iterator(RDD.scala:328) at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92) at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161) at org.apache.spark.scheduler.Task.run(Task.scala:139) at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554) at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557) at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128) at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628) at java.base/java.lang.Thread.run(Thread.java:834) 23/07/22 19:02:46 ERROR Utils: Aborting task java.lang.InterruptedException at java.base/java.util.concurrent.locks.AbstractQueuedSynchronizer.tryAcquireSharedNanos(AbstractQueuedSynchronizer.java:1367) at scala.concurrent.impl.Promise$DefaultPromise.tryAwait(Promise.scala:248) at scala.concurrent.impl.Promise$DefaultPromise.ready(Promise.scala:258) at scala.concurrent.impl.Promise$DefaultPromise.result(Promise.scala:263) at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:314) at org.apache.spark.scheduler.OutputCommitCoordinator.canCommit(OutputCommitCoordinator.scala:107) at org.apache.spark.sql.execution.datasources.v2.WritingSparkTask.$anonfun$run$1(WriteToDataSourceV2Exec.scala:478) at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1563) at org.apache.spark.sql.execution.datasources.v2.WritingSparkTask.run(WriteToDataSourceV2Exec.scala:509) at org.apache.spark.sql.execution.datasources.v2.WritingSparkTask.run$(WriteToDataSourceV2Exec.scala:448) at org.apache.spark.sql.execution.datasources.v2.DataWritingSparkTask$.run(WriteToDataSourceV2Exec.scala:514) at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.$anonfun$writeWithV2$2(WriteToDataSourceV2Exec.scala:411) at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92) at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161) at org.apache.spark.scheduler.Task.run(Task.scala:139) at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554) at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557) at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128) at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628) at java.base/java.lang.Thread.run(Thread.java:834) 23/07/22 19:02:46 ERROR DataWritingSparkTask: Aborting commit for partition 154 (task 3618, attempt 0, stage 53.0) 23/07/22 19:02:46 ERROR DataWritingSparkTask: Aborted commit for partition 154 (task 3618, attempt 0, stage 53.0) 23/07/22 19:02:46 WARN Shell: Interrupted while joining on: Thread[Thread-36453,5,] 
java.lang.InterruptedException at java.base/java.lang.Object.wait(Native Method) at java.base/java.lang.Thread.join(Thread.java:1305) at java.base/java.lang.Thread.join(Thread.java:1379) at org.apache.hadoop.util.Shell.joinThread(Shell.java:1042) at org.apache.hadoop.util.Shell.runCommand(Shell.java:1002) at org.apache.hadoop.util.Shell.run(Shell.java:900) at org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:1212) at org.apache.hadoop.util.Shell.execCommand(Shell.java:1306) at org.apache.hadoop.util.Shell.execCommand(Shell.java:1288) at org.apache.hadoop.fs.FileUtil.readLink(FileUtil.java:212) at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileLinkStatusInternal(RawLocalFileSystem.java:1113) at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatusInternal(RawLocalFileSystem.java:1102) at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatus(RawLocalFileSystem.java:1073) at org.apache.hadoop.fs.FileSystem.rename(FileSystem.java:1590) at org.apache.hadoop.fs.DelegateToFileSystem.renameInternal(DelegateToFileSystem.java:206) at org.apache.hadoop.fs.AbstractFileSystem.renameInternal(AbstractFileSystem.java:790) at org.apache.hadoop.fs.AbstractFileSystem.rename(AbstractFileSystem.java:720) at org.apache.hadoop.fs.ChecksumFs.renameInternal(ChecksumFs.java:489) at org.apache.hadoop.fs.AbstractFileSystem.rename(AbstractFileSystem.java:720) at org.apache.hadoop.fs.FileContext.rename(FileContext.java:1036) at org.apache.spark.sql.execution.streaming.FileContextBasedCheckpointFileManager.renameTempFile(CheckpointFileManager.scala:377) at org.apache.spark.sql.execution.streaming.CheckpointFileManager$RenameBasedFSDataOutputStream.close(CheckpointFileManager.scala:154) at net.jpountz.lz4.LZ4BlockOutputStream.close(LZ4BlockOutputStream.java:198) at java.base/java.io.FilterOutputStream.close(FilterOutputStream.java:188) at org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider.finalizeDeltaFile(HDFSBackedStateStoreProvider.scala:450) at org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider.org$apache$spark$sql$execution$streaming$state$HDFSBackedStateStoreProvider$$commitUpdates(HDFSBackedStateStoreProvider.scala:320) at org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider$HDFSBackedStateStore.commit(HDFSBackedStateStoreProvider.scala:141) at org.apache.spark.sql.execution.streaming.state.StreamingAggregationStateManagerBaseImpl.commit(StreamingAggregationStateManager.scala:89) at org.apache.spark.sql.execution.streaming.StateStoreSaveExec.$anonfun$doExecute$7(statefulOperators.scala:436) at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23) at org.apache.spark.util.Utils$.timeTakenMs(Utils.scala:640) at org.apache.spark.sql.execution.streaming.StateStoreWriter.timeTakenMs(statefulOperators.scala:143) at org.apache.spark.sql.execution.streaming.StateStoreWriter.timeTakenMs$(statefulOperators.scala:143) at org.apache.spark.sql.execution.streaming.StateStoreSaveExec.timeTakenMs(statefulOperators.scala:392) at org.apache.spark.sql.execution.streaming.StateStoreSaveExec.$anonfun$doExecute$5(statefulOperators.scala:436) at org.apache.spark.sql.execution.streaming.state.package$StateStoreOps.$anonfun$mapPartitionsWithStateStore$1(package.scala:68) at org.apache.spark.sql.execution.streaming.state.StateStoreRDD.compute(StateStoreRDD.scala:127) at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364) at org.apache.spark.rdd.RDD.iterator(RDD.scala:328) at 
org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364) at org.apache.spark.rdd.RDD.iterator(RDD.scala:328) at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92) at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161) at org.apache.spark.scheduler.Task.run(Task.scala:139) at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554) at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557) at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128) at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628) at java.base/java.lang.Thread.run(Thread.java:834) 23/07/22 19:02:46 WARN Shell: Interrupted while joining on: Thread[Thread-36452,5,] java.lang.InterruptedException at java.base/java.lang.Object.wait(Native Method) at java.base/java.lang.Thread.join(Thread.java:1305) at java.base/java.lang.Thread.join(Thread.java:1379) at org.apache.hadoop.util.Shell.joinThread(Shell.java:1042) at org.apache.hadoop.util.Shell.runCommand(Shell.java:1002) at org.apache.hadoop.util.Shell.run(Shell.java:900) at org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:1212) at org.apache.hadoop.util.Shell.execCommand(Shell.java:1306) at org.apache.hadoop.util.Shell.execCommand(Shell.java:1288) at org.apache.hadoop.fs.FileUtil.readLink(FileUtil.java:212) at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileLinkStatusInternal(RawLocalFileSystem.java:1113) at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatusInternal(RawLocalFileSystem.java:1102) at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatus(RawLocalFileSystem.java:1073) at org.apache.hadoop.fs.FileSystem.rename(FileSystem.java:1574) at org.apache.hadoop.fs.DelegateToFileSystem.renameInternal(DelegateToFileSystem.java:206) at org.apache.hadoop.fs.AbstractFileSystem.renameInternal(AbstractFileSystem.java:790) at org.apache.hadoop.fs.AbstractFileSystem.rename(AbstractFileSystem.java:720) at org.apache.hadoop.fs.ChecksumFs.renameInternal(ChecksumFs.java:496) at org.apache.hadoop.fs.AbstractFileSystem.rename(AbstractFileSystem.java:720) at org.apache.hadoop.fs.FileContext.rename(FileContext.java:1036) at org.apache.spark.sql.execution.streaming.FileContextBasedCheckpointFileManager.renameTempFile(CheckpointFileManager.scala:377) at org.apache.spark.sql.execution.streaming.CheckpointFileManager$RenameBasedFSDataOutputStream.close(CheckpointFileManager.scala:154) at net.jpountz.lz4.LZ4BlockOutputStream.close(LZ4BlockOutputStream.java:198) at java.base/java.io.FilterOutputStream.close(FilterOutputStream.java:188) at org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider.finalizeDeltaFile(HDFSBackedStateStoreProvider.scala:450) at org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider.org$apache$spark$sql$execution$streaming$state$HDFSBackedStateStoreProvider$$commitUpdates(HDFSBackedStateStoreProvider.scala:320) at org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider$HDFSBackedStateStore.commit(HDFSBackedStateStoreProvider.scala:141) at org.apache.spark.sql.execution.streaming.state.StreamingAggregationStateManagerBaseImpl.commit(StreamingAggregationStateManager.scala:89) at 
org.apache.spark.sql.execution.streaming.StateStoreSaveExec.$anonfun$doExecute$7(statefulOperators.scala:436) at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23) at org.apache.spark.util.Utils$.timeTakenMs(Utils.scala:640) at org.apache.spark.sql.execution.streaming.StateStoreWriter.timeTakenMs(statefulOperators.scala:143) at org.apache.spark.sql.execution.streaming.StateStoreWriter.timeTakenMs$(statefulOperators.scala:143) at org.apache.spark.sql.execution.streaming.StateStoreSaveExec.timeTakenMs(statefulOperators.scala:392) at org.apache.spark.sql.execution.streaming.StateStoreSaveExec.$anonfun$doExecute$5(statefulOperators.scala:436) at org.apache.spark.sql.execution.streaming.state.package$StateStoreOps.$anonfun$mapPartitionsWithStateStore$1(package.scala:68) at org.apache.spark.sql.execution.streaming.state.StateStoreRDD.compute(StateStoreRDD.scala:127) at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364) at org.apache.spark.rdd.RDD.iterator(RDD.scala:328) at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364) at org.apache.spark.rdd.RDD.iterator(RDD.scala:328) at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92) at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161) at org.apache.spark.scheduler.Task.run(Task.scala:139) at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554) at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557) at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128) at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628) at java.base/java.lang.Thread.run(Thread.java:834) 23/07/22 19:02:46 WARN Shell: Interrupted while joining on: Thread[Thread-36454,5,] java.lang.InterruptedException at java.base/java.lang.Object.wait(Native Method) at java.base/java.lang.Thread.join(Thread.java:1305) at java.base/java.lang.Thread.join(Thread.java:1379) at org.apache.hadoop.util.Shell.joinThread(Shell.java:1042) at org.apache.hadoop.util.Shell.runCommand(Shell.java:1002) at org.apache.hadoop.util.Shell.run(Shell.java:900) at org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:1212) at org.apache.hadoop.util.Shell.execCommand(Shell.java:1306) at org.apache.hadoop.util.Shell.execCommand(Shell.java:1288) at org.apache.hadoop.fs.FileUtil.readLink(FileUtil.java:212) at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileLinkStatusInternal(RawLocalFileSystem.java:1113) at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatusInternal(RawLocalFileSystem.java:1102) at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatus(RawLocalFileSystem.java:1073) at org.apache.hadoop.fs.DelegateToFileSystem.getFileLinkStatus(DelegateToFileSystem.java:133) at org.apache.hadoop.fs.AbstractFileSystem.renameInternal(AbstractFileSystem.java:751) at org.apache.hadoop.fs.AbstractFileSystem.rename(AbstractFileSystem.java:720) at org.apache.hadoop.fs.ChecksumFs.renameInternal(ChecksumFs.java:489) at org.apache.hadoop.fs.AbstractFileSystem.rename(AbstractFileSystem.java:720) at org.apache.hadoop.fs.FileContext.rename(FileContext.java:1036) at org.apache.spark.sql.execution.streaming.FileContextBasedCheckpointFileManager.renameTempFile(CheckpointFileManager.scala:377) at 
org.apache.spark.sql.execution.streaming.CheckpointFileManager$RenameBasedFSDataOutputStream.close(CheckpointFileManager.scala:154) at net.jpountz.lz4.LZ4BlockOutputStream.close(LZ4BlockOutputStream.java:198) at java.base/java.io.FilterOutputStream.close(FilterOutputStream.java:188) at org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider.finalizeDeltaFile(HDFSBackedStateStoreProvider.scala:450) at org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider.org$apache$spark$sql$execution$streaming$state$HDFSBackedStateStoreProvider$$commitUpdates(HDFSBackedStateStoreProvider.scala:320) at org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider$HDFSBackedStateStore.commit(HDFSBackedStateStoreProvider.scala:141) at org.apache.spark.sql.execution.streaming.state.StreamingAggregationStateManagerBaseImpl.commit(StreamingAggregationStateManager.scala:89) at org.apache.spark.sql.execution.streaming.StateStoreSaveExec.$anonfun$doExecute$7(statefulOperators.scala:436) at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23) at org.apache.spark.util.Utils$.timeTakenMs(Utils.scala:640) at org.apache.spark.sql.execution.streaming.StateStoreWriter.timeTakenMs(statefulOperators.scala:143) at org.apache.spark.sql.execution.streaming.StateStoreWriter.timeTakenMs$(statefulOperators.scala:143) at org.apache.spark.sql.execution.streaming.StateStoreSaveExec.timeTakenMs(statefulOperators.scala:392) at org.apache.spark.sql.execution.streaming.StateStoreSaveExec.$anonfun$doExecute$5(statefulOperators.scala:436) at org.apache.spark.sql.execution.streaming.state.package$StateStoreOps.$anonfun$mapPartitionsWithStateStore$1(package.scala:68) at org.apache.spark.sql.execution.streaming.state.StateStoreRDD.compute(StateStoreRDD.scala:127) at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364) at org.apache.spark.rdd.RDD.iterator(RDD.scala:328) at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364) at org.apache.spark.rdd.RDD.iterator(RDD.scala:328) at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92) at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161) at org.apache.spark.scheduler.Task.run(Task.scala:139) at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554) at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557) at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128) at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628) at java.base/java.lang.Thread.run(Thread.java:834) 23/07/22 19:02:46 ERROR Utils: Aborting task=======> (154 + 8) / 200] org.apache.spark.SparkException: Commit denied for partition 155 (task 3619, attempt 0, stage 53.0). 
at org.apache.spark.sql.errors.QueryExecutionErrors$.commitDeniedError(QueryExecutionErrors.scala:929) at org.apache.spark.sql.execution.datasources.v2.WritingSparkTask.$anonfun$run$1(WriteToDataSourceV2Exec.scala:485) at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1563) at org.apache.spark.sql.execution.datasources.v2.WritingSparkTask.run(WriteToDataSourceV2Exec.scala:509) at org.apache.spark.sql.execution.datasources.v2.WritingSparkTask.run$(WriteToDataSourceV2Exec.scala:448) at org.apache.spark.sql.execution.datasources.v2.DataWritingSparkTask$.run(WriteToDataSourceV2Exec.scala:514) at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.$anonfun$writeWithV2$2(WriteToDataSourceV2Exec.scala:411) at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92) at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161) at org.apache.spark.scheduler.Task.run(Task.scala:139) at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554) at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557) at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128) at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628) at java.base/java.lang.Thread.run(Thread.java:834) 23/07/22 19:02:46 ERROR DataWritingSparkTask: Aborting commit for partition 155 (task 3619, attempt 0, stage 53.0) 23/07/22 19:02:46 ERROR DataWritingSparkTask: Aborted commit for partition 155 (task 3619, attempt 0, stage 53.0) 23/07/22 19:02:46 WARN TaskSetManager: Lost task 160.0 in stage 53.0 (TID 3624) (172.20.10.12 executor driver): TaskKilled (Stage cancelled) 23/07/22 19:02:46 WARN TaskSetManager: Lost task 155.0 in stage 53.0 (TID 3619) (172.20.10.12 executor driver): TaskKilled (Stage cancelled) 23/07/22 19:02:46 WARN TaskSetManager: Lost task 154.0 in stage 53.0 (TID 3618) (172.20.10.12 executor driver): TaskKilled (Stage cancelled) 23/07/22 19:02:46 WARN TaskSetManager: Lost task 161.0 in stage 53.0 (TID 3625) (172.20.10.12 executor driver): TaskKilled (Stage cancelled) 23/07/22 19:02:46 WARN TaskSetManager: Lost task 159.0 in stage 53.0 (TID 3623) (172.20.10.12 executor driver): TaskKilled (Stage cancelled) 23/07/22 19:02:46 ERROR Utils: Aborting task org.apache.spark.SparkException: Commit denied for partition 157 (task 3621, attempt 0, stage 53.0). 
at org.apache.spark.sql.errors.QueryExecutionErrors$.commitDeniedError(QueryExecutionErrors.scala:929) at org.apache.spark.sql.execution.datasources.v2.WritingSparkTask.$anonfun$run$1(WriteToDataSourceV2Exec.scala:485) at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1563) at org.apache.spark.sql.execution.datasources.v2.WritingSparkTask.run(WriteToDataSourceV2Exec.scala:509) at org.apache.spark.sql.execution.datasources.v2.WritingSparkTask.run$(WriteToDataSourceV2Exec.scala:448) at org.apache.spark.sql.execution.datasources.v2.DataWritingSparkTask$.run(WriteToDataSourceV2Exec.scala:514) at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.$anonfun$writeWithV2$2(WriteToDataSourceV2Exec.scala:411) at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92) at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161) at org.apache.spark.scheduler.Task.run(Task.scala:139) at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554) at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557) at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128) at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628) at java.base/java.lang.Thread.run(Thread.java:834) 23/07/22 19:02:46 ERROR DataWritingSparkTask: Aborting commit for partition 157 (task 3621, attempt 0, stage 53.0) 23/07/22 19:02:46 ERROR DataWritingSparkTask: Aborted commit for partition 157 (task 3621, attempt 0, stage 53.0) 23/07/22 19:02:46 WARN TaskSetManager: Lost task 157.0 in stage 53.0 (TID 3621) (172.20.10.12 executor driver): TaskKilled (Stage cancelled) 23/07/22 19:02:46 ERROR Utils: Aborting task org.apache.spark.SparkException: Commit denied for partition 156 (task 3620, attempt 0, stage 53.0). 
at org.apache.spark.sql.errors.QueryExecutionErrors$.commitDeniedError(QueryExecutionErrors.scala:929) at org.apache.spark.sql.execution.datasources.v2.WritingSparkTask.$anonfun$run$1(WriteToDataSourceV2Exec.scala:485) at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1563) at org.apache.spark.sql.execution.datasources.v2.WritingSparkTask.run(WriteToDataSourceV2Exec.scala:509) at org.apache.spark.sql.execution.datasources.v2.WritingSparkTask.run$(WriteToDataSourceV2Exec.scala:448) at org.apache.spark.sql.execution.datasources.v2.DataWritingSparkTask$.run(WriteToDataSourceV2Exec.scala:514) at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.$anonfun$writeWithV2$2(WriteToDataSourceV2Exec.scala:411) at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92) at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161) at org.apache.spark.scheduler.Task.run(Task.scala:139) at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554) at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557) at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128) at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628) at java.base/java.lang.Thread.run(Thread.java:834) 23/07/22 19:02:46 ERROR DataWritingSparkTask: Aborting commit for partition 156 (task 3620, attempt 0, stage 53.0) 23/07/22 19:02:46 ERROR DataWritingSparkTask: Aborted commit for partition 156 (task 3620, attempt 0, stage 53.0) 23/07/22 19:02:46 WARN TaskSetManager: Lost task 156.0 in stage 53.0 (TID 3620) (172.20.10.12 executor driver): TaskKilled (Stage cancelled) 23/07/22 19:02:46 ERROR Utils: Aborting task org.apache.spark.SparkException: Commit denied for partition 158 (task 3622, attempt 0, stage 53.0). 
at org.apache.spark.sql.errors.QueryExecutionErrors$.commitDeniedError(QueryExecutionErrors.scala:929) at org.apache.spark.sql.execution.datasources.v2.WritingSparkTask.$anonfun$run$1(WriteToDataSourceV2Exec.scala:485) at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1563) at org.apache.spark.sql.execution.datasources.v2.WritingSparkTask.run(WriteToDataSourceV2Exec.scala:509) at org.apache.spark.sql.execution.datasources.v2.WritingSparkTask.run$(WriteToDataSourceV2Exec.scala:448) at org.apache.spark.sql.execution.datasources.v2.DataWritingSparkTask$.run(WriteToDataSourceV2Exec.scala:514) at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.$anonfun$writeWithV2$2(WriteToDataSourceV2Exec.scala:411) at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92) at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161) at org.apache.spark.scheduler.Task.run(Task.scala:139) at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554) at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557) at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128) at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628) at java.base/java.lang.Thread.run(Thread.java:834) 23/07/22 19:02:46 ERROR DataWritingSparkTask: Aborting commit for partition 158 (task 3622, attempt 0, stage 53.0) 23/07/22 19:02:46 ERROR DataWritingSparkTask: Aborted commit for partition 158 (task 3622, attempt 0, stage 53.0) 23/07/22 19:02:46 WARN TaskSetManager: Lost task 158.0 in stage 53.0 (TID 3622) (172.20.10.12 executor driver): TaskKilled (Stage cancelled)
Now that's the end.
THANK YOU for sticking around and I hope this project was useful to you.
Hope to talk to you soon!
Helio Ribeiro
helioribeiropro@gmail.com
+55 (11) 9 3932-8049