# https://kafka-python.readthedocs.io/en/master/
# !pip install -q kafka-python
# !pip install -q -U watermark


# LIBRARIES
import time
import random
import kafka
import numpy as np
import pandas as pd
from json import dumps
from kafka import KafkaProducer
import warnings
warnings.filterwarnings('ignore')


from platform import python_version
print('Author: Helio Ribeiro')
print('Python version:', python_version())
print('\nPackage versions:')
%reload_ext watermark
%watermark --iversions

Author: Helio Ribeiro
Python version: 3.9.12

Package versions:
numpy : 1.21.5
kafka : 2.0.2
pandas: 1.4.2


# KAFKA SERVER & TOPICNAME
SERVER = 'localhost:9092'
TOPIC = "heliospotifyproject"


# DATA LOAD
df_heliospotifyproject = pd.read_csv("data/dataset.csv")


# Sequential 0-based id for every row of the dataset.
df_heliospotifyproject['order_id'] = np.arange(len(df_heliospotifyproject))

# Keep only ASCII letters in the artist columns; every other character
# (spaces, digits, punctuation) is dropped.
# regex=True is passed explicitly: the default of Series.str.replace changed
# to regex=False in pandas 2.0, where this pattern would be matched literally
# and nothing would be removed.
df_heliospotifyproject['Artist Name(s)'] = df_heliospotifyproject['Artist Name(s)'].str.replace('[^a-zA-Z]', '', regex=True)
df_heliospotifyproject['Artist IDs'] = df_heliospotifyproject['Artist IDs'].str.replace('[^a-zA-Z]', '', regex=True)


df_heliospotifyproject.shape

(4399, 24)


df_heliospotifyproject.head(10)


dict_musics = df_heliospotifyproject.to_dict(orient = "records")


dict_musics[1:3]

[{'Spotify ID': '4J39ZEbwqHwtWLImUKmrn9',
  'Artist IDs': 'CRfAxYjJsDBHwvWFnjaRRRPXwFwQmoTNqNHBGU',
  'Track Name': '88 Days',
  'Album Name': 'Heat',
  'Artist Name(s)': 'SaraKingIanOlney',
  'Release Date': '2018-08-04',
  'Duration (ms)': 227961,
  'Popularity': 8,
  'Added By': 'spotify:user:predict0',
  'Added At': '2018-08-28T19:51:58Z',
  'Genres': 'bedroom pop',
  'Danceability': 0.335,
  'Energy': 0.401,
  'Key': 3,
  'Loudness': -10.749,
  'Mode': 1,
  'Speechiness': 0.0333,
  'Acousticness': 0.134,
  'Instrumentalness': 0.582,
  'Liveness': 0.134,
  'Valence': 0.233,
  'Tempo': 155.062,
  'Time Signature': 4,
  'order_id': 1},
 {'Spotify ID': '0a12d4HUjOmQSqHqLopWYx',
  'Artist IDs': 'hytHTGTflktWAhKcxQ',
  'Track Name': 'Castaway',
  'Album Name': 'Castaway',
  'Artist Name(s)': 'ARZLEE',
  'Release Date': '2018-08-10',
  'Duration (ms)': 230000,
  'Popularity': 0,
  'Added By': 'spotify:user:predict0',
  'Added At': '2018-08-28T19:51:58Z',
  'Genres': nan,
  'Danceability': 0.553,
  'Energy': 0.422,
  'Key': 1,
  'Loudness': -11.29,
  'Mode': 1,
  'Speechiness': 0.0314,
  'Acousticness': 0.11,
  'Instrumentalness': 3.25e-05,
  'Liveness': 0.119,
  'Valence': 0.29,
  'Tempo': 83.988,
  'Time Signature': 4,
  'order_id': 2}]


# KAFKA PRODUCER

# Record fields, in the exact order they are written to each Kafka message.
MESSAGE_FIELDS = (
    "order_id",
    "Spotify ID",
    "Track Name",
    "Popularity",
    "Duration (ms)",
    "Artist Name(s)",
    "Artist IDs",
    "Release Date",
    "Danceability",
    "Energy",
    "Key",
    "Loudness",
    "Mode",
    "Speechiness",
    "Acousticness",
    "Instrumentalness",
    "Liveness",
    "Valence",
    "Tempo",
    "Time Signature",
)


def build_music_message(music, fields=MESSAGE_FIELDS):
    """Return the comma-separated message for one record.

    Args:
        music: Mapping from dataset column name to value.
        fields: Column names to emit, in output order.

    Returns:
        The selected values converted with ``str`` and joined with ','.

    Raises:
        KeyError: If ``music`` lacks one of ``fields``.
    """
    # NOTE(review): values are neither quoted nor escaped, so a value that
    # itself contains ',' (e.g. a track name) yields extra fields in the
    # message -- confirm the consumer copes with that.
    return ",".join(str(music[field]) for field in fields)


if __name__ == "__main__":

    # Messages are plain text; the serializer encodes them as UTF-8 bytes.
    producer = KafkaProducer(bootstrap_servers=SERVER,
                             value_serializer=lambda x: x.encode('utf-8'))

    try:
        # Send one message per record, pausing one second between records.
        for music in dict_musics:
            message = build_music_message(music)

            print("Next Music:")
            print(message)
            producer.send(TOPIC, message)
            time.sleep(1)
    finally:
        # send() only queues the record. Flush so nothing buffered is lost,
        # then close the producer -- also when the loop is interrupted.
        producer.flush()
        producer.close()

    print("Done")

Next Music:
0,22a0Ji6EQKkY0tBohlN4Od,There You Are,2,231240,KirstenLudwig,qLyYYhSlsjwymwVKwW,2018-08-06,0.487,0.707,9,-5.596,0,0.0304,0.334,0.282,0.105,0.316,129.856,4
Next Music:
1,4J39ZEbwqHwtWLImUKmrn9,88 Days,8,227961,SaraKingIanOlney,CRfAxYjJsDBHwvWFnjaRRRPXwFwQmoTNqNHBGU,2018-08-04,0.335,0.401,3,-10.749,1,0.0333,0.134,0.582,0.134,0.233,155.062,4
Next Music:
2,0a12d4HUjOmQSqHqLopWYx,Castaway,0,230000,ARZLEE,hytHTGTflktWAhKcxQ,2018-08-10,0.553,0.422,1,-11.29,1,0.0314,0.11,3.25e-05,0.119,0.29,83.988,4
Next Music:
3,4u1DykFW1HjYAGNoDCiXfC,Arouse,30,213913,Shagabondgoodboynoah,WjyoJHRHlTbUTZTwqpAgeqmtJlARXjon,2018-08-03,0.67,0.751,1,-6.066,1,0.433,0.0728,0.0,0.368,0.533,91.961,4
Next Music:
4,0u7JZm9ORerlZnnxxSdMwl,Lonely,21,258738,Hayleau,AdKmjgFzpcTvmVfGwR,2018-08-10,0.67,0.709,8,-3.921,0,0.0406,0.0169,0.00063,0.0542,0.577,98.954,4
Next Music:
5,0wuy2BYIVLbflFDqnR9Jay,Orsay,6,413658,TheSvens,kCwrYUFSJCubbbnZrE,2018-08-03,0.61,0.444,0,-11.858,1,0.0316,0.0486,0.886,0.128,0.283,122.992,4
Next Music:
6,6LkIZZRrPQIbHMyBR5mTc2,Nurture,0,191641,IslandFox,TOsWuafqeWtrvYXqbnYAV,2018-08-09,0.324,0.808,7,-11.415,0,0.0504,0.0159,0.639,0.181,0.266,133.925,4
Next Music:
7,5U27fxNSd27XtX876xUsfV,Dinosaur Hair - Remix,1,257152,AndyFerroCharlieConway,HsKUExgNcRJojPmBcNqzgwpvzedAIjuDBM,2018-08-10,0.814,0.53,0,-10.086,1,0.0383,0.456,0.92,0.135,0.663,125.908,4
Next Music:
8,5ogJOpmyDsvrAdttU6JLnN,Breathing Underwater,0,174999,MorningWars,gslbnQQLLcNzfjnxQY,2018-08-03,0.361,0.687,1,-11.258,0,0.0461,4.53e-06,0.0204,0.115,0.477,150.042,4
Next Music:
9,65rLHt6A58MFRxlNWVDU1Z,Summer,22,232746,NoSo,WlYiRrlrChWktQDo,2018-08-01,0.771,0.587,4,-7.517,1,0.038,0.0168,0.00723,0.0706,0.21,123.962,4
Next Music:
10,1F8360UuztzClhrF9OjxNG,Honest.,5,220312,KemiAde,KDipZITiqyiYakmvUP,2018-08-03,0.444,0.537,5,-8.364,0,0.114,0.217,0.0,0.259,0.498,77.264,4
Next Music:
11,1UAc7PQPYO0oKWiyVHf5Cl,Surrender,9,192000,MeganGageAabo,nRMXgTIubQCUtqyIWQMCFlLlfXNwakWzTn,2018-08-10,0.593,0.613,4,-7.98,0,0.0523,0.00858,1.78e-05,0.495,0.263,119.998,4
Next Music:
12,6c14rjEjfZYJMjIW7mwARr,High Demand (feat. Maxpain) [Cousin's Story],0,246949,ArrowNandeViceMaxpain,oIzfchABetwVPprYyeETwNmkDsmHsHpRoIEoHUTWzxiMXtok,2018-08-10,0.719,0.445,4,-8.105,0,0.051,0.206,0.0,0.0808,0.27,139.988,4
Next Music:
13,4CvV6VfT6taArl6ZnO85qK,Options,30,124540,thuy,ROERViOWbnuvqhja,2018-08-06,0.762,0.472,6,-7.352,1,0.0431,0.00276,6.97e-05,0.107,0.412,92.45,4
Next Music:
14,1t4iNo42J9rUg3zRKoUGuf,Echo Chamber,0,216981,AuntySocial,EwIzEFKHYdAQUL,2018-08-02,0.584,0.56,6,-9.233,0,0.256,0.273,2.66e-05,0.311,0.339,135.097,4
Next Music:
15,656sRwYGUOiuJHtL4c61gg,Drown,3,225230,SILKINKayvahn,BHnUSyoIJvmMziIxMtFDVwpnGZDbUrCkQTRL,2018-08-02,0.637,0.488,9,-10.212,0,0.0457,0.687,0.0,0.0466,0.366,130.05,4
Next Music:
16,6EMDSb7h7aMvPaOLFyHgtV,Another World,0,249920,MAYAbiLLLy,vgPjTFLBOAdbvuBNekOkofUFYUMlQYQyCuD,2018-08-09,0.459,0.459,0,-9.527,1,0.134,0.548,0.0003,0.0935,0.725,162.826,5
Next Music:
17,2hmvAQaRCV9uXS9zgOTdql,Forgiveness,8,148085,CassetteTapes,HJQlchOVokaUSV,2018-08-14,0.735,0.576,9,-6.242,0,0.125,0.264,2.95e-05,0.111,0.167,94.168,4
Next Music:
18,6aGD4TJJcYSaYTPYc9IYQq,Silkworm Society,0,326381,NowVsNow,FjOKQvLFCXHYgVsZlmy,2018-08-10,0.512,0.664,11,-11.977,1,0.0385,0.64,0.218,0.272,0.304,97.568,4
Next Music:
19,6I4aAn84IgKFVrqRYl27b6,Pep Ventura,2,388000,SamOB,fcUcAZmcUODOxQq,2018-08-03,0.801,0.737,9,-9.014,1,0.0633,0.116,0.908,0.382,0.428,119.996,4
Next Music:
20,3A6pUp13Fnedvp0gCdFbsw,Let's Fly,0,115746,AsaBuchanon,JaXdhMrVcYCnhZp,2018-08-03,0.768,0.705,8,-8.369,1,0.144,0.00018,0.000248,0.359,0.811,110.578,4
Next Music:
21,75NIWnnH4Xhxq3IvWq78dm,Pleasure,13,169248,FHAT,ltQmRqfdWeirSIzCV,2018-07-27,0.691,0.402,5,-9.397,0,0.0927,0.39,3.31e-06,0.102,0.578,79.929,4
Next Music:
22,2OgNRfvvOcQWp1F5WrBjY9,Hindsight,1,221538,CLLLAPS,zbotleVbvaaVcWollHnD,2018-08-03,0.443,0.589,0,-8.564,0,0.0477,0.0723,0.00237,0.15,0.382,142.832,4
Next Music:
23,5L5AB6Ps6wO9FDgfoZ1ZKk,Blackheart Heights,0,204669,Feign,KtseXhYTGVhOvUkEU,2018-07-22,0.669,0.564,5,-6.512,1,0.0778,0.0681,0.0,0.1,0.148,83.057,4
Next Music:
24,7hdM3l7whE3lTHP6WsoKZT,Reason,46,200000,HablotBrownMathsTimeJoy,LtgEnShwvrqAaKohgskMwCPkzFZbGfRlPHyK,2018-07-29,0.799,0.309,5,-8.462,0,0.165,0.164,0.00037,0.0854,0.553,89.964,4
Next Music:
25,0DZ3ER7zgfksLAeEfycvQW,I'm Gone,27,286895,RomeinSilver,HhXQxTHPEdlASgIN,2018-07-27,0.648,0.677,1,-7.49,0,0.155,0.0617,0.0378,0.352,0.278,74.936,4
Next Music:
26,626pd4EMcMFAJzAYPLdmDR,Flowerbomb,14,190149,SienaLiggins,DLTBcpdWQsAPeNtPZv,2018-07-27,0.599,0.729,8,-6.691,1,0.0658,0.0428,0.000109,0.164,0.258,173.894,4
Next Music:
...


# pip install pyspark==3.3.2


# LIBRARIES
import os
import time
import random
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import Normalizer, StandardScaler


from platform import python_version
print('Author: Helio Ribeiro')
print('Python version:', python_version())
print('\nPackage versions:')
%reload_ext watermark
%watermark --iversions

Author: Helio Ribeiro
Python version: 3.9.12

Package versions:
numpy  : 1.21.5
kafka  : 2.0.2
pyspark: 3.3.2
pandas : 1.4.2
sys    : 3.9.12 (main, Jun  1 2022, 06:34:44) 
[Clang 12.0.0 ]


# KAFKA SERVER & TOPICNAME
SERVER = 'localhost:9092'
TOPIC = "heliospotifyproject"


# SPARK CONNECTORS FOR KAFKA
# Connector jars are looked up in ./jars under the current working directory
# and handed to the "spark.jars" setting as one comma-separated string.
# NOTE(review): the file names mix Spark 3.2.1, 3.3.2 and 3.1.2 builds --
# confirm they are compatible with the installed pyspark.
jars_dir = os.getcwd() + "/jars/"
jar_files = [
    "spark-sql-kafka-0-10_2.12-3.2.1.jar",
    "kafka-clients-2.1.1.jar",
    "spark-streaming-kafka-0-10-assembly_2.12-3.3.2.jar",
    "commons-pool2-2.8.0.jar",
    "spark-token-provider-kafka-0-10_2.12-3.1.2.jar",
]
spark_jars = ",".join(jars_dir + jar_file for jar_file in jar_files)


# INITIALIZE SPARK SESSION
# Get or create the session named "heliospotifyproject", with the Kafka
# connector jars registered through "spark.jars".
spark = (
    SparkSession.builder
    .config("spark.jars", spark_jars)
    .appName("heliospotifyproject")
    .getOrCreate()
)

24/02/14 04:02:16 WARN Utils: Your hostname, Helios-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.0.6 instead (on interface en0)
24/02/14 04:02:16 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
24/02/14 04:02:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


spark.sparkContext.setLogLevel("ERROR")


# USE SPARK STREAMING FOR DATA READ AND SAVE INFO AS DATAFRAME
# Streaming source: subscribe to TOPIC on the broker at SERVER, starting from
# the latest offsets.
kafka_reader = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", SERVER)
    .option("subscribe", TOPIC)
    .option("startingOffsets", "latest")
)
df = kafka_reader.load()


# CAST THE KAFKA MESSAGE VALUE TO STRING, KEEP ITS TIMESTAMP, AND SAVE BOTH INTO A NEW DATAFRAME
df1 = df.selectExpr("CAST(value AS STRING)", "timestamp")


# DEFINE SCHEMA
# DDL-style schema used to parse each comma-separated message: one fragment
# per column, in message order. Adjacent literals are concatenated as-is, so
# the separators below are part of the resulting string.
def_schema = (
    "order_id INT, "
    "id STRING, "
    "name STRING, "
    "popularity INT, "
    "duration_ms DOUBLE, "
    "artists STRING, "
    "id_artists STRING, "
    "release_date STRING, "
    "danceability DOUBLE,"
    "energy DOUBLE, "
    "key INT, "
    "loudness DOUBLE, "
    "mode INT,"
    "speechiness DOUBLE,"
    "acousticness DOUBLE, "
    "instrumentalness DOUBLE, "
    "liveness DOUBLE, "
    "valence DOUBLE, "
    "tempo DOUBLE, "
    "time_signature DOUBLE"
)


# SELECT DATASTREAM ACCORDING TO SCHEMA AND SAVE IT INTO A NEW DATAFRAME
df2 = df1.select(from_csv(col("value"), def_schema).alias("song"), "timestamp")


# CREATE 'VIEW' ON SPARK'S MEMORY
# Flatten the parsed "song" struct into top-level columns next to the
# timestamp, register the result for Spark SQL as "df3_View", and print the
# resulting schema.
df3 = df2.select("song.*", "timestamp")
df3.createOrReplaceTempView("df3_View")
df3.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- popularity: integer (nullable = true)
 |-- duration_ms: double (nullable = true)
 |-- artists: string (nullable = true)
 |-- id_artists: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- danceability: double (nullable = true)
 |-- energy: double (nullable = true)
 |-- key: integer (nullable = true)
 |-- loudness: double (nullable = true)
 |-- mode: integer (nullable = true)
 |-- speechiness: double (nullable = true)
 |-- acousticness: double (nullable = true)
 |-- instrumentalness: double (nullable = true)
 |-- liveness: double (nullable = true)
 |-- valence: double (nullable = true)
 |-- tempo: double (nullable = true)
 |-- time_signature: double (nullable = true)
 |-- timestamp: timestamp (nullable = true)


# THEN WE SELECT THE DATA FROM OUR STREAM
music_stream = spark.sql("SELECT * FROM df3_View")


# WE STILL HAVE TO GENERATE SPARK'S STREAMING, SO WE CAN'T VISUALIZE IT JUST YET.
# music_stream.show()


# CREATE SPARK DATA STREAM
# Configure the streaming query: triggered every 5 seconds, append mode,
# written to the in-memory table "spark_table" for later Spark SQL queries.
stream_writer = (
    music_stream.writeStream
    .trigger(processingTime='5 seconds')
    .outputMode("append")
    .option("truncate", "false")
    .format("memory")
    .queryName("spark_table")
)
music_stream_spark = stream_writer.start()

# Wait with a timeout of 1 instead of blocking indefinitely; the returned
# flag is the cell's displayed value.
music_stream_spark.awaitTermination(1)

[Stage 0:>                                                          (0 + 0) / 1]

False


# SELECT SONGS VIA SPARK STREAMING
spark_songs = spark.sql("SELECT * FROM spark_table")


# NOW WE CAN VISUALIZE OUR STREAM
spark_songs.show(5)

+--------+--------------------+----------+----------+-----------+--------------------+--------------------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+--------------------+
|order_id|                  id|      name|popularity|duration_ms|             artists|          id_artists|release_date|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|time_signature|           timestamp|
+--------+--------------------+----------+----------+-----------+--------------------+--------------------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+--------------------+
|     101|47mAiKhmnkY9dJ2GU...|     Radio|         1|   162024.0|              Spissy|    JzReCvrdmAkxGCcT|  2018-07-10|       0.616| 0.847|  0|  -4.578|   0|     0.0308|      0.0166|         0.00338|  0.0753|   0.87|114.997|           4.0|2024-02-14 04:03:...|
|     102|13yqiEmOlVXyCJ5rD...|   Control|         0|   109956.0|          lovesadKID|   AKgMMrkCGsURNvyXs|  2018-07-19|       0.626| 0.682|  7|  -9.377|   1|      0.264|       0.133|             0.0|   0.102|  0.596| 95.111|           4.0|2024-02-14 04:03:...|
|     103|694UYYV6nOiT3rUoJ...|     Vices|        11|   150909.0|          LhasaPetik|   EtMqKRBCptLUAYQed|  2018-06-24|       0.727| 0.583|  7|  -6.187|   1|     0.0707|       0.672|             0.0|     0.1|  0.309|131.682|           4.0|2024-02-14 04:03:...|
|     104|5r8lQLxTTAhmltQXu...|Game No Mo|         0|   187531.0|         JennyPenkin| BQvdGvRDDXZtEEyELke|  2018-07-13|       0.701| 0.544|  5|  -5.949|   0|     0.0518|       0.453|          0.0957|   0.167|  0.576| 84.047|           4.0|2024-02-14 04:03:...|
|     105|4Repz6Yn1aABTFj9O...|     Money|         0|   185000.0|FinisMundiLiliann...|gAReFedlCvkPUhxIb...|  2018-07-20|       0.716| 0.543|  9|  -8.972|   0|      0.191|       0.737|         2.13E-6|   0.534|  0.688|107.955|           4.0|2024-02-14 04:03:...|
+--------+--------------------+----------+----------+-----------+--------------------+--------------------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+--------------------+
only showing top 5 rows


# CHECK SOME COLUMNS, FOR INSTANCE:
spark_songs.select('order_id', 'id', 'name', 'popularity', 'duration_ms', 'artists').show(5)

+--------+--------------------+----------+----------+-----------+--------------------+
|order_id|                  id|      name|popularity|duration_ms|             artists|
+--------+--------------------+----------+----------+-----------+--------------------+
|     101|47mAiKhmnkY9dJ2GU...|     Radio|         1|   162024.0|              Spissy|
|     102|13yqiEmOlVXyCJ5rD...|   Control|         0|   109956.0|          lovesadKID|
|     103|694UYYV6nOiT3rUoJ...|     Vices|        11|   150909.0|          LhasaPetik|
|     104|5r8lQLxTTAhmltQXu...|Game No Mo|         0|   187531.0|         JennyPenkin|
|     105|4Repz6Yn1aABTFj9O...|     Money|         0|   185000.0|FinisMundiLiliann...|
+--------+--------------------+----------+----------+-----------+--------------------+
only showing top 5 rows


# COUNT OF SONGS EXTRACTED IN REAL TIME
spark_songs.count()

928


# https://pypi.org/project/spotipy/
# !pip install -q spotipy
# !pip install ujson


# IMPORTS
import os
import ujson
import spotipy
import spotipy.util
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")


%reload_ext watermark
%watermark -a "Helio Ribeiro" --iversions

Author: Helio Ribeiro

matplotlib: 3.5.1
numpy     : 1.21.5
kafka     : 2.0.2
pyspark   : 3.3.2
seaborn   : 0.11.2
pandas    : 1.4.2
sys       : 3.9.12 (main, Jun  1 2022, 06:34:44) 
[Clang 12.0.0 ]
ujson     : 5.9.0
spotipy   : 2.23.0


# ADD YOUR SPOTIFY IDs HERE
# Credentials and redirect URI are exposed as SPOTIPY_* environment variables.
# The values below are placeholders: replace them with those of your own
# Spotify app, and avoid committing real secrets to version control.
os.environ["SPOTIPY_CLIENT_ID"] = 'your_client_ID'
os.environ["SPOTIPY_CLIENT_SECRET"] = 'your_client_secret'
os.environ["SPOTIPY_REDIRECT_URI"] = 'http://localhost:7777/callback'


# USER PREFERENCE SCOPE
scope = 'user-library-read'


# SPOTIFY USERNAME 
username = 'helioribeiropro@gmail.com'


# ACCESS TOKEN CREATION
# NOTE(review): spotipy flags util.prompt_for_user_token as deprecated in
# favour of spotipy.Spotify(auth_manager=SpotifyOAuth(scope=scope)) --
# confirm against the installed version and consider migrating.
token = spotipy.util.prompt_for_user_token(username, scope)


# AUTHENTICATION OBJECT
spotipy_obj = spotipy.Spotify(auth = token)


# EXTRACT UP TO 50 SONGS FROM USER'S FAVORITE
saved_tracks = spotipy_obj.current_user_saved_tracks(limit = 50)
print('Saved Tracks: %s ' % saved_tracks)

Saved Tracks: {'href': 'https://api.spotify.com/v1/me/tracks?offset=0&limit=50', 'items': [{'added_at': '2024-02-12T21:08:42Z', 'track': {'album': {'album_type': 'album', 'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/1mcTU81TzQhprhouKaTkpq'}, 'href': 'https://api.spotify.com/v1/artists/1mcTU81TzQhprhouKaTkpq', 'id': '1mcTU81TzQhprhouKaTkpq', 'name': 'Rauw Alejandro', 'type': 'artist', 'uri': 'spotify:artist:1mcTU81TzQhprhouKaTkpq'}], 'available_markets': ['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA', 'CL', 'CO', 'CR', 'CY', 'CZ', 'DK', 'DO', 'DE', 'EC', 'EE', 'SV', 'FI', 'FR', 'GR', 'GT', 'HN', 'HK', 'HU', 'IS', 'IE', 'IT', 'LV', 'LT', 'LU', 'MY', 'MT', 'MX', 'NL', 'NZ', 'NI', 'NO', 'PA', 'PY', 'PE', 'PH', 'PL', 'PT', 'SG', 'SK', 'ES', 'SE', 'CH', 'TW', 'TR', 'UY', 'US', 'GB', 'AD', 'LI', 'MC', 'ID', 'JP', 'TH', 'VN', 'RO', 'IL', 'ZA', 'SA', 'AE', 'BH', 'QA', 'OM', 'KW', 'EG', 'MA', 'DZ', 'TN', 'LB', 'JO', 'PS', 'IN', 'BY', 'KZ', 'MD', 'UA', 'AL', 'BA', 'HR', 'ME', 'MK', 'RS', 'SI', 'KR', 'BD', 'PK', 'LK', 'GH', 'KE', 'NG', 'TZ', 'UG', 'AG', 'AM', 'BS', 'BB', 'BZ', 'BT', 'BW', 'BF', 'CV', 'CW', 'DM', 'FJ', 'GM', 'GE', 'GD', 'GW', 'GY', 'HT', 'JM', 'KI', 'LS', 'LR', 'MW', 'MV', 'ML', 'MH', 'FM', 'NA', 'NR', 'NE', 'PW', 'PG', 'WS', 'SM', 'ST', 'SN', 'SC', 'SL', 'SB', 'KN', 'LC', 'VC', 'SR', 'TL', 'TO', 'TT', 'TV', 'VU', 'AZ', 'BN', 'BI', 'KH', 'CM', 'TD', 'KM', 'GQ', 'SZ', 'GA', 'GN', 'KG', 'LA', 'MO', 'MR', 'MN', 'NP', 'RW', 'TG', 'UZ', 'ZW', 'BJ', 'MG', 'MU', 'MZ', 'AO', 'CI', 'DJ', 'ZM', 'CD', 'CG', 'IQ', 'LY', 'TJ', 'VE', 'ET', 'XK'], 'external_urls': {'spotify': 'https://open.spotify.com/album/2Nt6MDJXfoxQ22tIQgWXIh'}, 'href': 'https://api.spotify.com/v1/albums/2Nt6MDJXfoxQ22tIQgWXIh', 'id': '2Nt6MDJXfoxQ22tIQgWXIh', 'images': [{'height': 640, 'url': 'https://i.scdn.co/image/ab67616d0000b273d9525f27b0a9e25b1fa21230', 'width': 640}, {'height': 300, 'url': 'https://i.scdn.co/image/ab67616d00001e02d9525f27b0a9e25b1fa21230', 'width': 
300}, {'height': 64, 'url': 'https://i.scdn.co/image/ab67616d00004851d9525f27b0a9e25b1fa21230', 'width': 64}], 'name': 'VICE VERSA', 'release_date': '2021-12-10', 'release_date_precision': 'day', 'total_tracks': 14, 'type': 'album', 'uri': 'spotify:album:2Nt6MDJXfoxQ22tIQgWXIh'}, 'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/1mcTU81TzQhprhouKaTkpq'}, 'href': 'https://api.spotify.com/v1/artists/1mcTU81TzQhprhouKaTkpq', 'id': '1mcTU81TzQhprhouKaTkpq', 'name': 'Rauw Alejandro', 'type': 'artist', 'uri': 'spotify:artist:1mcTU81TzQhprhouKaTkpq'}], 'available_markets': ['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA', 'CL', 'CO', 'CR', 'CY', 'CZ', 'DK', 'DO', 'DE', 'EC', 'EE', 'SV', 'FI', 'FR', 'GR', 'GT', 'HN', 'HK', 'HU', 'IS', 'IE', 'IT', 'LV', 'LT', 'LU', 'MY', 'MT', 'MX', 'NL', 'NZ', 'NI', 'NO', 'PA', 'PY', 'PE', 'PH', 'PL', 'PT', 'SG', 'SK', 'ES', 'SE', 'CH', 'TW', 'TR', 'UY', 'US', 'GB', 'AD', 'LI', 'MC', 'ID', 'JP', 'TH', 'VN', 'RO', 'IL', 'ZA', 'SA', 'AE', 'BH', 'QA', 'OM', 'KW', 'EG', 'MA', 'DZ', 'TN', 'LB', 'JO', 'PS', 'IN', 'BY', 'KZ', 'MD', 'UA', 'AL', 'BA', 'HR', 'ME', 'MK', 'RS', 'SI', 'KR', 'BD', 'PK', 'LK', 'GH', 'KE', 'NG', 'TZ', 'UG', 'AG', 'AM', 'BS', 'BB', 'BZ', 'BT', 'BW', 'BF', 'CV', 'CW', 'DM', 'FJ', 'GM', 'GE', 'GD', 'GW', 'GY', 'HT', 'JM', 'KI', 'LS', 'LR', 'MW', 'MV', 'ML', 'MH', 'FM', 'NA', 'NR', 'NE', 'PW', 'PG', 'WS', 'SM', 'ST', 'SN', 'SC', 'SL', 'SB', 'KN', 'LC', 'VC', 'SR', 'TL', 'TO', 'TT', 'TV', 'VU', 'AZ', 'BN', 'BI', 'KH', 'CM', 'TD', 'KM', 'GQ', 'SZ', 'GA', 'GN', 'KG', 'LA', 'MO', 'MR', 'MN', 'NP', 'RW', 'TG', 'UZ', 'ZW', 'BJ', 'MG', 'MU', 'MZ', 'AO', 'CI', 'DJ', 'ZM', 'CD', 'CG', 'IQ', 'LY', 'TJ', 'VE', 'ET', 'XK'], 'disc_number': 1, 'duration_ms': 199604, 'explicit': True, 'external_ids': {'isrc': 'USSD12100202'}, 'external_urls': {'spotify': 'https://open.spotify.com/track/3rdAz1fbUfZxYgaCviYhRo'}, 'href': 'https://api.spotify.com/v1/tracks/3rdAz1fbUfZxYgaCviYhRo', 'id': '3rdAz1fbUfZxYgaCviYhRo', 
'is_local': False, 'name': 'Todo De Ti', 'popularity': 76, 'preview_url': 'https://p.scdn.co/mp3-preview/c22200b1d15945f42242d40077ce0da4fc873be1?cid=90ff0496824b4821b797ce4682ab4a8d', 'track_number': 1, 'type': 'track', 'uri': 'spotify:track:3rdAz1fbUfZxYgaCviYhRo'}}, {'added_at': '2024-02-12T21:08:23Z', 'track': {'album': {'album_type': 'single', 'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/7ltDVBr6mKbRvohxheJ9h1'}, 'href': 'https://api.spotify.com/v1/artists/7ltDVBr6mKbRvohxheJ9h1', 'id': '7ltDVBr6mKbRvohxheJ9h1', 'name': 'ROSALÍA', 'type': 'artist', 'uri': 'spotify:artist:7ltDVBr6mKbRvohxheJ9h1'}, {'external_urls': {'spotify': 'https://open.spotify.com/artist/1mcTU81TzQhprhouKaTkpq'}, 'href': 'https://api.spotify.com/v1/artists/1mcTU81TzQhprhouKaTkpq', 'id': '1mcTU81TzQhprhouKaTkpq', 'name': 'Rauw Alejandro', 'type': 'artist', 'uri': 'spotify:artist:1mcTU81TzQhprhouKaTkpq'}], 'available_markets': ['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA', 'CL', 'CO', 'CR', 'CY', 'CZ', 'DK', 'DO', 'DE', 'EC', 'EE', 'SV', 'FI', 'FR', 'GR', 'GT', 'HN', 'HK', 'HU', 'IS', 'IE', 'IT', 'LV', 'LT', 'LU', 'MY', 'MT', 'MX', 'NL', 'NZ', 'NI', 'NO', 'PA', 'PY', 'PE', 'PH', 'PL', 'PT', 'SG', 'SK', 'ES', 'SE', 'CH', 'TW', 'TR', 'UY', 'US', 'GB', 'AD', 'LI', 'MC', 'ID', 'JP', 'TH', 'VN', 'RO', 'IL', 'ZA', 'SA', 'AE', 'BH', 'QA', 'OM', 'KW', 'EG', 'MA', 'DZ', 'TN', 'LB', 'JO', 'PS', 'IN', 'BY', 'KZ', 'MD', 'UA', 'AL', 'BA', 'HR', 'ME', 'MK', 'RS', 'SI', 'KR', 'BD', 'PK', 'LK', 'GH', 'KE', 'NG', 'TZ', 'UG', 'AG', 'AM', 'BS', 'BB', 'BZ', 'BT', 'BW', 'BF', 'CV', 'CW', 'DM', 'FJ', 'GM', 'GE', 'GD', 'GW', 'GY', 'HT', 'JM', 'KI', 'LS', 'LR', 'MW', 'MV', 'ML', 'MH', 'FM', 'NA', 'NR', 'NE', 'PW', 'PG', 'WS', 'SM', 'ST', 'SN', 'SC', 'SL', 'SB', 'KN', 'LC', 'VC', 'SR', 'TL', 'TO', 'TT', 'TV', 'VU', 'AZ', 'BN', 'BI', 'KH', 'CM', 'TD', 'KM', 'GQ', 'SZ', 'GA', 'GN', 'KG', 'LA', 'MO', 'MR', 'MN', 'NP', 'RW', 'TG', 'UZ', 'ZW', 'BJ', 'MG', 'MU', 'MZ', 'AO', 'CI', 'DJ', 
'ZM', 'CD', 'CG', 'IQ', 'LY', 'TJ', 'VE', 'ET', 'XK'], 'external_urls': {'spotify': 'https://open.spotify.com/album/50uChhk7AKkzDKytDixjYW'}, 'href': 'https://api.spotify.com/v1/albums/50uChhk7AKkzDKytDixjYW', 'id': '50uChhk7AKkzDKytDixjYW', 'images': [{'height': 640, 'url': 'https://i.scdn.co/image/ab67616d0000b2734d6cf0d0d5e32ca4fa3a59e1', 'width': 640}, {'height': 300, 'url': 'https://i.scdn.co/image/ab67616d00001e024d6cf0d0d5e32ca4fa3a59e1', 'width': 300}, {'height': 64, 'url': 'https://i.scdn.co/image/ab67616d000048514d6cf0d0d5e32ca4fa3a59e1', 'width': 64}], 'name': 'RR', 'release_date': '2023-03-24', 'release_date_precision': 'day', 'total_tracks': 3, 'type': 'album', 'uri': 'spotify:album:50uChhk7AKkzDKytDixjYW'}, 'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/7ltDVBr6mKbRvohxheJ9h1'}, 'href': 'https://api.spotify.com/v1/artists/7ltDVBr6mKbRvohxheJ9h1', 'id': '7ltDVBr6mKbRvohxheJ9h1', 'name': 'ROSALÍA', 'type': 'artist', 'uri': 'spotify:artist:7ltDVBr6mKbRvohxheJ9h1'}, {'external_urls': {'spotify': 'https://open.spotify.com/artist/1mcTU81TzQhprhouKaTkpq'}, 'href': 'https://api.spotify.com/v1/artists/1mcTU81TzQhprhouKaTkpq', 'id': '1mcTU81TzQhprhouKaTkpq', 'name': 'Rauw Alejandro', 'type': 'artist', 'uri': 'spotify:artist:1mcTU81TzQhprhouKaTkpq'}], 'available_markets': ['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA', 'CL', 'CO', 'CR', 'CY', 'CZ', 'DK', 'DO', 'DE', 'EC', 'EE', 'SV', 'FI', 'FR', 'GR', 'GT', 'HN', 'HK', 'HU', 'IS', 'IE', 'IT', 'LV', 'LT', 'LU', 'MY', 'MT', 'MX', 'NL', 'NZ', 'NI', 'NO', 'PA', 'PY', 'PE', 'PH', 'PL', 'PT', 'SG', 'SK', 'ES', 'SE', 'CH', 'TW', 'TR', 'UY', 'US', 'GB', 'AD', 'LI', 'MC', 'ID', 'JP', 'TH', 'VN', 'RO', 'IL', 'ZA', 'SA', 'AE', 'BH', 'QA', 'OM', 'KW', 'EG', 'MA', 'DZ', 'TN', 'LB', 'JO', 'PS', 'IN', 'BY', 'KZ', 'MD', 'UA', 'AL', 'BA', 'HR', 'ME', 'MK', 'RS', 'SI', 'KR', 'BD', 'PK', 'LK', 'GH', 'KE', 'NG', 'TZ', 'UG', 'AG', 'AM', 'BS', 'BB', 'BZ', 'BT', 'BW', 'BF', 'CV', 'CW', 'DM', 'FJ', 'GM', 
'GE', 'GD', 'GW', 'GY', 'HT', 'JM', 'KI', 'LS', 'LR', 'MW', 'MV', 'ML', 'MH', 'FM', 'NA', 'NR', 'NE', 'PW', 'PG', 'WS', 'SM', 'ST', 'SN', 'SC', 'SL', 'SB', 'KN', 'LC', 'VC', 'SR', 'TL', 'TO', 'TT', 'TV', 'VU', 'AZ', 'BN', 'BI', 'KH', 'CM', 'TD', 'KM', 'GQ', 'SZ', 'GA', 'GN', 'KG', 'LA', 'MO', 'MR', 'MN', 'NP', 'RW', 'TG', 'UZ', 'ZW', 'BJ', 'MG', 'MU', 'MZ', 'AO', 'CI', 'DJ', 'ZM', 'CD', 'CG', 'IQ', 'LY', 'TJ', 'VE', 'ET', 'XK'], 'disc_number': 1, 'duration_ms': 194543, 'explicit': False, 'external_ids': {'isrc': 'USSM12301258'}, 'external_urls': {'spotify': 'https://open.spotify.com/track/609E1JCInJncactoMmkDon'}, 'href': 'https://api.spotify.com/v1/tracks/609E1JCInJncactoMmkDon', 'id': '609E1JCInJncactoMmkDon', 'is_local': False, 'name': 'BESO', 'popularity': 85, 'preview_url': 'https://p.scdn.co/mp3-preview/ec3accbe111dac19702411be5adb665d4cd44c0c?cid=90ff0496824b4821b797ce4682ab4a8d', 'track_number': 1, 'type': 'track', 'uri': 'spotify:track:609E1JCInJncactoMmkDon'}}, ...


# TOTAL NUMBER OF SAVED SONGS REPORTED BY THE API
n_tracks = saved_tracks['total']
print('Total Tracks: %d ' % n_tracks)

Total Tracks: 35


# FUNCTION TO EXTRACT ATTRIBUTES FROM SONG LIST
def select_features(track_response):
    """Flatten one saved-track entry into the attributes used downstream.

    Parameters:
        track_response: mapping with a 'track' entry that holds the
            'id', 'name', 'artists' and 'popularity' fields.

    Returns:
        dict with the track 'id' and 'name' converted to strings,
        'artists' as a list of artist names, and 'popularity' passed
        through unchanged.
    """
    track = track_response['track']
    features = {}
    features['id'] = str(track['id'])
    features['name'] = str(track['name'])
    features['artists'] = [performer['name'] for performer in track['artists']]
    features['popularity'] = track['popularity']
    return features


# APPLY FUNCTION (FIRST PAGE OF SAVED TRACKS)
tracks = list(map(select_features, saved_tracks['items']))


# EXTRACTS ATTRIBUTES FROM FAVORITE SONGS
# KEEP REQUESTING THE FOLLOWING PAGE WHILE THE RESPONSE REPORTS A 'next' ONE
while saved_tracks['next']:
    saved_tracks = spotipy_obj.next(saved_tracks)
    page_tracks = [select_features(item) for item in saved_tracks['items']]
    tracks += page_tracks


# CREATE PANDAS DATAFRAME
pd.set_option('display.max_rows', len(tracks))
df_tracks = pd.DataFrame(tracks)
# KEEP ONLY THE FIRST LISTED ARTIST OF EACH TRACK
first_artist = df_tracks['artists'].map(lambda names: names[0])
df_tracks['artists'] = first_artist


# DISPLAY THE FIRST 10 ROWS
sorted_df_tracks = df_tracks.sort_values("popularity", ascending = False)
sorted_df_tracks.head(10)


# DICTIONARY FOR AUDIO FEATURES (TRACK ID -> FEATURES RETURNED BY THE API)
audio_features = {}


# EXTRACTION OF AUDIO ATTRIBUTES
# The audio-features endpoint accepts up to 100 track ids per request, so the
# ids are sent in batches rather than with one HTTP round-trip per track.
track_ids = df_tracks['id'].tolist()
for batch_start in range(0, len(track_ids), 100):
    id_batch = track_ids[batch_start:batch_start + 100]
    # Results are returned in the order the ids were requested; a track with
    # no audio features comes back as None in its position.
    audio_features.update(zip(id_batch, spotipy_obj.audio_features(id_batch)))


# ADD AUDIO ATTRIBUTES TO THE DATAFRAME
def _feature_column(feature_name, cast = None):
    """Look up `feature_name` for every track id, optionally casting each value."""
    def lookup(idd):
        value = audio_features[idd][feature_name]
        return value if cast is None else cast(value)
    return df_tracks['id'].apply(lookup)


# ONE COLUMN PER FEATURE, ADDED IN THIS ORDER; 'key' IS STORED AS A STRING
for feature_name in ('acousticness', 'speechiness', 'key', 'liveness',
                     'instrumentalness', 'energy', 'tempo', 'loudness',
                     'danceability', 'valence'):
    df_tracks[feature_name] = _feature_column(
        feature_name, cast = str if feature_name == 'key' else None)


df_tracks.head()


# SELECT A SONG RANDOMLY
# randint is inclusive on both ends, so every row position 0..len-1 can be
# drawn. iloc with a one-element list keeps the result a single-row DataFrame;
# slicing head(n)[-1:] would be empty for n == 0 and could never reach the
# last row.
random_song = random.randint(0, len(df_tracks) - 1)
df_random_song = df_tracks.iloc[[random_song]]
df_random_song


# SPARK STREAMING SONGS - PREVIEW THE FIRST 5 ROWS
spark_songs.show(5)

+--------+--------------------+----------+----------+-----------+--------------------+--------------------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+--------------------+
|order_id|                  id|      name|popularity|duration_ms|             artists|          id_artists|release_date|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|time_signature|           timestamp|
+--------+--------------------+----------+----------+-----------+--------------------+--------------------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+--------------------+
|     101|47mAiKhmnkY9dJ2GU...|     Radio|         1|   162024.0|              Spissy|    JzReCvrdmAkxGCcT|  2018-07-10|       0.616| 0.847|  0|  -4.578|   0|     0.0308|      0.0166|         0.00338|  0.0753|   0.87|114.997|           4.0|2024-02-14 04:03:...|
|     102|13yqiEmOlVXyCJ5rD...|   Control|         0|   109956.0|          lovesadKID|   AKgMMrkCGsURNvyXs|  2018-07-19|       0.626| 0.682|  7|  -9.377|   1|      0.264|       0.133|             0.0|   0.102|  0.596| 95.111|           4.0|2024-02-14 04:03:...|
|     103|694UYYV6nOiT3rUoJ...|     Vices|        11|   150909.0|          LhasaPetik|   EtMqKRBCptLUAYQed|  2018-06-24|       0.727| 0.583|  7|  -6.187|   1|     0.0707|       0.672|             0.0|     0.1|  0.309|131.682|           4.0|2024-02-14 04:03:...|
|     104|5r8lQLxTTAhmltQXu...|Game No Mo|         0|   187531.0|         JennyPenkin| BQvdGvRDDXZtEEyELke|  2018-07-13|       0.701| 0.544|  5|  -5.949|   0|     0.0518|       0.453|          0.0957|   0.167|  0.576| 84.047|           4.0|2024-02-14 04:03:...|
|     105|4Repz6Yn1aABTFj9O...|     Money|         0|   185000.0|FinisMundiLiliann...|gAReFedlCvkPUhxIb...|  2018-07-20|       0.716| 0.543|  9|  -8.972|   0|      0.191|       0.737|         2.13E-6|   0.534|  0.688|107.955|           4.0|2024-02-14 04:03:...|
+--------+--------------------+----------+----------+-----------+--------------------+--------------------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+--------------------+
only showing top 5 rows


# WE CAN DROP THESE COLUMNS
unused_columns = ['order_id',
                  'mode',
                  'release_date',
                  'id_artists',
                  'time_signature',
                  'duration_ms',
                  'timestamp']
spark_songs = spark_songs.drop(*unused_columns)


# CREATE DATASET WITH THE RANDOM SONG
df_sp = spark.createDataFrame(df_random_song)


# CONCATENATES STREAMING SONGS WITH SPOTIFY SONGS
# union() pairs columns by position, and the two frames list the same columns
# in a different order (e.g. 'artists'/'popularity' are swapped), which would
# scramble the random song's features. unionByName() aligns them by name.
df = spark_songs.unionByName(df_sp)


df.show(5)

+--------------------+----------+----------+--------------------+------------+------+---+--------+-----------+------------+----------------+--------+-------+-------+
|                  id|      name|popularity|             artists|danceability|energy|key|loudness|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|
+--------------------+----------+----------+--------------------+------------+------+---+--------+-----------+------------+----------------+--------+-------+-------+
|47mAiKhmnkY9dJ2GU...|     Radio|         1|              Spissy|       0.616| 0.847|  0|  -4.578|     0.0308|      0.0166|         0.00338|  0.0753|   0.87|114.997|
|13yqiEmOlVXyCJ5rD...|   Control|         0|          lovesadKID|       0.626| 0.682|  7|  -9.377|      0.264|       0.133|             0.0|   0.102|  0.596| 95.111|
|694UYYV6nOiT3rUoJ...|     Vices|        11|          LhasaPetik|       0.727| 0.583|  7|  -6.187|     0.0707|       0.672|             0.0|     0.1|  0.309|131.682|
|5r8lQLxTTAhmltQXu...|Game No Mo|         0|         JennyPenkin|       0.701| 0.544|  5|  -5.949|     0.0518|       0.453|          0.0957|   0.167|  0.576| 84.047|
|4Repz6Yn1aABTFj9O...|     Money|         0|FinisMundiLiliann...|       0.716| 0.543|  9|  -8.972|      0.191|       0.737|         2.13E-6|   0.534|  0.688|107.955|
+--------------------+----------+----------+--------------------+------------+------+---+--------+-----------+------------+----------------+--------+-------+-------+
only showing top 5 rows


# PREPARING THE VECTOR ASSEMBLER
# NUMERIC AUDIO COLUMNS THAT ARE PACKED INTO THE 'song_features' VECTOR
song_feature_columns = ['danceability',
                        'energy',
                        'loudness',
                        'speechiness',
                        'acousticness',
                        'instrumentalness',
                        'liveness',
                        'valence',
                        'tempo']
vector = VectorAssembler(outputCol = 'song_features',
                         inputCols = song_feature_columns)


# DISCARD ROWS WITH INVALID VALUES
vector.setHandleInvalid("skip")
assembled = vector.transform(df)


# PREPARE THE STANDARDIZER
std = StandardScaler(outputCol = 'standardized', inputCol = 'song_features')


# FIT THE STANDARDIZER
scale = std.fit(dataset = assembled)


# DATAFRAME WITH STANDARDIZED DATA
df = scale.transform(dataset = assembled)


df.show(5)

+--------------------+----------+----------+--------------------+------------+------+---+--------+-----------+------------+----------------+--------+-------+-------+--------------------+--------------------+
|                  id|      name|popularity|             artists|danceability|energy|key|loudness|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|       song_features|        standardized|
+--------------------+----------+----------+--------------------+------------+------+---+--------+-----------+------------+----------------+--------+-------+-------+--------------------+--------------------+
|47mAiKhmnkY9dJ2GU...|     Radio|         1|              Spissy|       0.616| 0.847|  0|  -4.578|     0.0308|      0.0166|         0.00338|  0.0753|   0.87|114.997|[0.616,0.847,-4.5...|[3.60938138365851...|
|13yqiEmOlVXyCJ5rD...|   Control|         0|          lovesadKID|       0.626| 0.682|  7|  -9.377|      0.264|       0.133|             0.0|   0.102|  0.596| 95.111|[0.626,0.682,-9.3...|[3.66797523728933...|
|694UYYV6nOiT3rUoJ...|     Vices|        11|          LhasaPetik|       0.727| 0.583|  7|  -6.187|     0.0707|       0.672|             0.0|     0.1|  0.309|131.682|[0.727,0.583,-6.1...|[4.25977315896061...|
|5r8lQLxTTAhmltQXu...|Game No Mo|         0|         JennyPenkin|       0.701| 0.544|  5|  -5.949|     0.0518|       0.453|          0.0957|   0.167|  0.576| 84.047|[0.701,0.544,-5.9...|[4.10742913952048...|
|4Repz6Yn1aABTFj9O...|     Money|         0|FinisMundiLiliann...|       0.716| 0.543|  9|  -8.972|      0.191|       0.737|         2.13E-6|   0.534|  0.688|107.955|[0.716,0.543,-8.9...|[4.19531991996671...|
+--------------------+----------+----------+--------------------+------------+------+---+--------+-----------+------------+----------------+--------+-------+-------+--------------------+--------------------+
only showing top 5 rows


# LET'S CREATE THE MODEL (3 CLUSTERS OVER THE STANDARDIZED FEATURES)...
object_KMeans = KMeans(k = 3, featuresCol = 'standardized')


# ...AND TRAIN IT
model_KMeans = object_KMeans.fit(dataset = df)


# APPLY THE TRAINED MODEL TO THE SAME DATA
df_output = model_KMeans.transform(dataset = df)


# CLASS
class RecoSystem():
    """Content-based recommender over a pandas DataFrame of songs.

    Songs are ranked by the Manhattan (L1) distance between their feature
    columns and those of a reference song looked up by name.
    """

    # Column positions left out of the distance computation.
    # NOTE(review): on a 14-column frame position 14 does not exist, so only
    # the first three columns are skipped and every remaining column (including
    # a trailing cluster label, if present) takes part in the distance --
    # confirm this is intended.
    _SKIPPED_POSITIONS = (0, 1, 2, 14)

    # Columns reported for each recommended song.
    _OUTPUT_COLUMNS = ['id', 'name',
                       'artists',
                       'acousticness',
                       'liveness',
                       'instrumentalness',
                       'energy',
                       'danceability',
                       'valence']

    def __init__(self, data):
        """Keep a reference to the song table.

        Parameters:
            data: DataFrame with a 'name' column, the output columns listed
                in ``_OUTPUT_COLUMNS``, and values convertible to float in
                every column position that is not skipped.
        """
        self.data_ = data

    def Recomm(self, song_name, amount = 1):
        """Return the ``amount`` songs closest to ``song_name``.

        Parameters:
            song_name: name of the reference song (case-insensitive match).
            amount: number of recommendations to return.

        Returns:
            DataFrame with the output columns of the closest songs, nearest
            first. Songs sharing the reference name are never recommended.

        Raises:
            IndexError: if no song in the data has that name.
        """
        wanted = song_name.lower()
        names = self.data_['name'].str.lower()

        matches = self.data_[names == wanted]
        if matches.empty:
            raise IndexError(f'song {song_name!r} was not found in the data')

        positions = [pos for pos in range(self.data_.shape[1])
                     if pos not in self._SKIPPED_POSITIONS]

        # Work on an explicit copy: adding a column to a filtered slice
        # triggers pandas' SettingWithCopyWarning.
        res_dt = self.data_[names != wanted].copy()

        # The first match is the reference; astype(float) also converts
        # numeric strings column by column.
        song = matches.iloc[[0], positions].astype(float).to_numpy()
        others = res_dt.iloc[:, positions].astype(float).to_numpy()

        # Sum of absolute differences per row, computed in one array operation.
        res_dt['distance'] = np.absolute(others - song).sum(axis = 1)
        res_dt = res_dt.sort_values('distance')

        return res_dt[self._OUTPUT_COLUMNS][:amount]


# COLUMN NAMING
# COLUMNS KEPT FROM THE CLUSTERED DATA, IN THIS ORDER
label_columns = ['id',
                 'name',
                 'artists',
                 'danceability',
                 'energy',
                 'key',
                 'loudness',
                 'speechiness',
                 'acousticness',
                 'instrumentalness',
                 'liveness',
                 'valence',
                 'tempo',
                 'prediction']
datalabel = df_output.select(*label_columns)


# FINAL DATASET
df_final = datalabel.toPandas()

# DISCARD ROWS WHOSE ARTIST IS '0' OR THAT HAVE A ZERO IN ANY OF THESE FEATURES,
# THEN REMOVE EXACT DUPLICATE ROWS
zero_checked = ['danceability', 'liveness', 'instrumentalness', 'energy', 'valence']
discard = df_final['artists'].eq('0') | df_final[zero_checked].eq(0.0).any(axis = 1)
df_final = df_final[~discard].drop_duplicates()


df_final.shape

(819, 14)


df_final.sample(5)


# CREATE OBJECT
reco_obj = RecoSystem(data = df_final)


# NAME OF THE RANDOMLY SELECTED SONG
song = df_random_song['name'].iloc[0]


print(song)

DJ Got Us Fallin' In Love (feat. Pitbull)


# EXECUTE RECOMMENDATION (amount DEFAULTS TO 1: ONLY THE CLOSEST SONG IS RETURNED)
recommendation = reco_obj.Recomm(song)

100%|█████████████████| 818/818 [00:00<00:00, 45165.94it/s]


# EXTRACTS A RANDOM SONG FROM SPOTIFY'S FAVORITES
# SAME COLUMNS, IN THE SAME ORDER, AS THE RECOMMENDATION OUTPUT
summary_columns = ['id', 'name',
                   'artists',
                   'acousticness',
                   'liveness',
                   'instrumentalness',
                   'energy',
                   'danceability',
                   'valence']
y = df_random_song[summary_columns]


# CONCATENATES THE RECOMMENDED SONG WITH SPOTIFY
recommendation = pd.concat((recommendation, y))


# SAVES THE RECOMMENDATION
recommendation.to_csv('recommendations/recommendation.csv')


# LOADS THE FILE
reco_reader = spark.read.format("csv").options(header = "true")
df_reco = reco_reader.load("recommendations/recommendation.csv")


# SONGS RECOMMENDATION
df_reco.show()

+---+--------------------+--------------------+----------+------------+--------+----------------+------+------------+-------+
|_c0|                  id|                name|   artists|acousticness|liveness|instrumentalness|energy|danceability|valence|
+---+--------------------+--------------------+----------+------------+--------+----------------+------+------------+-------+
|832|7khpPruHJK39VTBUQ...|            Stranger|MildOrange|       0.412|   0.109|           0.113| 0.491|       0.334|  0.452|
|  8|4356Typ82hUiFAynb...|DJ Got Us Fallin'...|     USHER|      0.0338|   0.082|             0.0| 0.861|       0.663|  0.654|
+---+--------------------+--------------------+----------+------------+--------+----------------+------+------------+-------+

	id	name	artists	popularity	acousticness	speechiness	key	liveness	instrumentalness	energy	tempo	loudness	danceability	valence
0	3rdAz1fbUfZxYgaCviYhRo	Todo De Ti	Rauw Alejandro	76	0.302	0.0506	3	0.0931	0.000196	0.719	127.962	-3.613	0.780	0.3360
1	609E1JCInJncactoMmkDon	BESO	ROSALÍA	85	0.736	0.1360	5	0.1730	0.000837	0.644	95.050	-6.671	0.768	0.5300
2	1ODFVLQszq0hCOdZtqV5wq	MR. OCTOBER	Bad Bunny	79	0.188	0.1720	8	0.1340	0.000020	0.612	126.013	-5.682	0.805	0.4250
3	4Jc7252S1P99DjQ1lNGEOc	CYBERTRUCK	Bad Bunny	80	0.371	0.3380	6	0.1070	0.000002	0.905	151.823	-4.948	0.704	0.0991
4	2yzshFeBIwH8tWIqHEFLeD	un x100to	Grupo Frontera	87	0.213	0.0458	9	0.2710	0.000000	0.720	83.827	-4.089	0.571	0.5420

	id	name	artists	danceability	energy	key	loudness	speechiness	acousticness	instrumentalness	liveness	valence	tempo	prediction
824	6BiN4kyZStu4o1wRxKCKtu	Aoba	shogonodo	0.798	0.198	8	-12.210	0.2350	0.6540	0.012300	0.122	0.222	140.233	1
299	6iVrHS4tsNTgxXt8sF514Q	Tonight Show	SkinMag	0.487	0.622	2	-10.618	0.0281	0.0116	0.000226	0.131	0.487	100.025	0
27	5yJEnwrgNY0ysEqRC3MG0n	Drivin'	Boyhood	0.518	0.666	4	-4.818	0.0252	0.4500	0.007760	0.102	0.434	91.412	0
825	1PtGazJZSxqZ1GiPTl29wA	girl in new york	ROLEMODEL	0.625	0.573	9	-7.401	0.0304	0.0332	0.323000	0.115	0.096	92.035	0
626	1ZGvS5RYbOfe6aV80K8PxM	for ever	landscape	0.159	0.244	7	-22.199	0.0416	0.9700	0.404000	0.114	0.257	135.191	1

REAL-TIME KAFKA - RECOMMENDATION SYSTEM (MUSIC) ¶

Big Data Real-Time Recommendations with Python, Apache Spark and Apache Kafka¶

TABLE OF CONTENTS¶

SUMMARY - ABSTRACT ¶

Return to Index ¶

OBJECTIVE: Create a Recommendation System using Apache Kafka and PySpark.¶

USED TECHNOLOGIES¶

1. THE BUSINESS PROBLEM & DATA ARCHITECTURE ¶

Return to Index ¶

Seems like a difficult job to pull off, right?
Well, I wouldn't say it's easy, but bear with me:

Let's take a look into how we will conduct the Data Architecture / Structure:¶

2. DOCKER & KAFKA SETUP ¶

Return to Index ¶

3. APACHE KAFKA - PRODUCER ¶

Return to Index ¶

3.1. PRODUCER - Stream Start ¶

Return to Index ¶

4. APACHE KAFKA - CONSUMER ¶

Return to Index ¶

4.1. Spotify Developer's App ¶

Return to Index ¶

4.2. Song Extraction ¶

Return to Index ¶

4.3. Data Pre Processing ¶

Return to Index ¶

4.4. Unsupervised Machine Learning ¶

Return to Index ¶

4.5. Recommendation System ¶

Return to Index ¶

Final remarks here before our last line of code.

5. ENDING & FINAL REMARKS ¶

Return to Index ¶

CONTACT INFO:¶

	Spotify ID	Artist IDs	Track Name	Album Name	Artist Name(s)	Release Date	Duration (ms)	Popularity	Added By	Added At	...	Loudness	Mode	Speechiness	Acousticness	Instrumentalness	Liveness	Valence	Tempo	Time Signature	order_id
0	22a0Ji6EQKkY0tBohlN4Od	qLyYYhSlsjwymwVKwW	There You Are	There You Are	KirstenLudwig	2018-08-06	231240	2	spotify:user:predict0	2018-08-28T19:51:58Z	...	-5.596	0	0.0304	0.334000	0.282000	0.1050	0.316	129.856	4	0
1	4J39ZEbwqHwtWLImUKmrn9	CRfAxYjJsDBHwvWFnjaRRRPXwFwQmoTNqNHBGU	88 Days	Heat	SaraKingIanOlney	2018-08-04	227961	8	spotify:user:predict0	2018-08-28T19:51:58Z	...	-10.749	1	0.0333	0.134000	0.582000	0.1340	0.233	155.062	4	1
2	0a12d4HUjOmQSqHqLopWYx	hytHTGTflktWAhKcxQ	Castaway	Castaway	ARZLEE	2018-08-10	230000	0	spotify:user:predict0	2018-08-28T19:51:58Z	...	-11.290	1	0.0314	0.110000	0.000032	0.1190	0.290	83.988	4	2
3	4u1DykFW1HjYAGNoDCiXfC	WjyoJHRHlTbUTZTwqpAgeqmtJlARXjon	Arouse	Arouse	Shagabondgoodboynoah	2018-08-03	213913	30	spotify:user:predict0	2018-08-28T19:51:58Z	...	-6.066	1	0.4330	0.072800	0.000000	0.3680	0.533	91.961	4	3
4	0u7JZm9ORerlZnnxxSdMwl	AdKmjgFzpcTvmVfGwR	Lonely	Lonely	Hayleau	2018-08-10	258738	21	spotify:user:predict0	2018-08-28T19:51:58Z	...	-3.921	0	0.0406	0.016900	0.000630	0.0542	0.577	98.954	4	4
5	0wuy2BYIVLbflFDqnR9Jay	kCwrYUFSJCubbbnZrE	Orsay	Strange Affairs	TheSvens	2018-08-03	413658	6	spotify:user:predict0	2018-08-28T19:51:58Z	...	-11.858	1	0.0316	0.048600	0.886000	0.1280	0.283	122.992	4	5
6	6LkIZZRrPQIbHMyBR5mTc2	TOsWuafqeWtrvYXqbnYAV	Nurture	Comrade	IslandFox	2018-08-09	191641	0	spotify:user:predict0	2018-08-28T19:51:58Z	...	-11.415	0	0.0504	0.015900	0.639000	0.1810	0.266	133.925	4	6
7	5U27fxNSd27XtX876xUsfV	HsKUExgNcRJojPmBcNqzgwpvzedAIjuDBM	Dinosaur Hair - Remix	Dinosaur Hair	AndyFerroCharlieConway	2018-08-10	257152	1	spotify:user:predict0	2018-08-28T19:51:58Z	...	-10.086	1	0.0383	0.456000	0.920000	0.1350	0.663	125.908	4	7
8	5ogJOpmyDsvrAdttU6JLnN	gslbnQQLLcNzfjnxQY	Breathing Underwater	Long Way Down	MorningWars	2018-08-03	174999	0	spotify:user:predict0	2018-08-28T19:51:58Z	...	-11.258	0	0.0461	0.000005	0.020400	0.1150	0.477	150.042	4	8
9	65rLHt6A58MFRxlNWVDU1Z	WlYiRrlrChWktQDo	Summer	Summer	NoSo	2018-08-01	232746	22	spotify:user:predict0	2018-08-28T19:51:58Z	...	-7.517	1	0.0380	0.016800	0.007230	0.0706	0.210	123.962	4	9

	id	name	artists	popularity
5	4MjDJD8cW7iVeWInc2Bdyj	MONACO	Bad Bunny	93
24	0KKkJNfGyhkQ5aFogxQAPU	That's What I Like	Bruno Mars	90
32	7qiZfU4dY1lWllzX7mPBI3	Shape of You	Ed Sheeran	89
17	5Y6nVaayzitvsD5F7nr3DV	West Coast	Lana Del Rey	87
4	2yzshFeBIwH8tWIqHEFLeD	un x100to	Grupo Frontera	87
6	5rb9QrpfcKFHM1EUbSIurX	Yeah! (feat. Lil Jon & Ludacris)	USHER	87
33	34gCuhDGsG4bRPIf9bb02f	Thinking out Loud	Ed Sheeran	85
30	3rmo8F54jFF8OgYsqTxm5d	Bad Habits	Ed Sheeran	85
8	4356Typ82hUiFAynbLYbPn	DJ Got Us Fallin' In Love (feat. Pitbull)	USHER	85
1	609E1JCInJncactoMmkDon	BESO	ROSALÍA	85

**REAL-TIME KAFKA - RECOMMENDATION SYSTEM (MUSIC)** ¶

**Big Data Real-Time Recommendations with Python, Apache Spark and Apache Kafka**¶

TABLE OF CONTENTS¶

SUMMARY - ABSTRACT ¶

OBJECTIVE: Create a Recommendation System using Apache Kafka and PySpark.¶

USED TECHNOLOGIES¶

1. THE BUSINESS PROBLEM & DATA ARCHITECTURE ¶

Seems like a difficult job to pull off, right? Well, I wouldn't say it's easy, but bear with me:

Let's take a look into how we will conduct the Data Architecture / Structure:¶

2. DOCKER & KAFKA SETUP ¶

3. APACHE KAFKA - PRODUCER ¶

3.1. PRODUCER - Stream Start ¶

4. APACHE KAFKA - CONSUMER ¶

4.1. Spotify Developer's App ¶

4.2. Song Extraction ¶

4.3. Data Pre Processing ¶

4.4. Unsupervised Machine Learning ¶

4.5. Recommendation System ¶

Final remarks here before our last line of code.

5. ENDING & FINAL REMARKS ¶

CONTACT INFO:¶

REAL-TIME KAFKA - RECOMMENDATION SYSTEM (MUSIC) ¶

Big Data Real-Time Recommendations with Python, Apache Spark and Apache Kafka¶

Seems like a difficult job to pull off, right?
Well, I wouldn't say it's easy, but bear with me: