## TF-IDF Vectorization and clustering

In [7]:
## importing libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# import seaborn as sns
import os

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [4]:
# Fetch the NLTK corpora needed for stop-word filtering and lemmatization.
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

# Shared text-normalization helpers used by clean_text.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to C:\Users\Akhil
[nltk_data]     PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Akhil
[nltk_data]     PC\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
def clean_text(text):
    """Normalize a piece of text into a space-separated string of tokens.

    Steps, in order: strip every character that is neither a word character
    nor whitespace, lower-case, split on whitespace, drop tokens found in
    ``stop_words``, then stem and lemmatize each remaining token.

    Parameters
    ----------
    text : str
        Raw text to normalize.

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    normalized = re.sub(r'[^\w\s]', '', text).lower()
    tokens = []
    for token in normalized.split():
        if token in stop_words:
            continue
        # Stem first, then lemmatize the stemmed form.
        tokens.append(lemmatizer.lemmatize(stemmer.stem(token)))
    return ' '.join(tokens)

def clean_data(df, column='Map Data', max_length=5000):
    """Return the rows of ``df`` whose text column is non-empty and not too long.

    Missing values in ``column`` are replaced by the empty string, then rows
    are kept only when the text length is strictly between 0 and
    ``max_length``. The input frame is left untouched; the result is an
    independent copy that keeps the original index labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the text column.
    column : str, default 'Map Data'
        Name of the text column to filter on.
    max_length : int, default 5000
        Exclusive upper bound on the text length.

    Returns
    -------
    pandas.DataFrame
        Filtered copy of ``df`` with missing text replaced by ``''``.
    """
    # `assign` builds a new frame, so the caller's DataFrame is not mutated.
    filled = df.assign(**{column: df[column].fillna('')})
    lengths = filled[column].str.len()
    keep = (lengths > 0) & (lengths < max_length)
    # Explicit copy so later column assignments on the result are safe.
    return filled[keep].copy()
    # filled[column] = filled[column].apply(clean_text)

In [13]:
# The data directory sits next to the notebook's working directory,
# i.e. <parent of cwd>/data.
parent_dir = os.path.dirname(os.getcwd())
data_folder = os.path.join(parent_dir, 'data')
data_file = os.path.join(data_folder, 'MMR_DATA.csv')

df = pd.read_csv(data_file)
df.head()

Unnamed: 0,row,col,latitude,longitude,Map Data
0,0,0,18.89433,72.784597,
1,0,1,18.89433,72.794102,Prongs Reef is a Natural;
2,0,2,18.89433,72.803607,United Services Club Golf Course is a Leisure ...
3,0,3,18.89433,72.813112,Indian Meterological Department is a Commercia...
4,0,4,18.89433,72.822617,


In [14]:
# Filter out empty / oversized descriptions and report how many rows remain.
df_clean = clean_data(df)
print(df_clean.shape[0])
df_clean.head()

791


Unnamed: 0,row,col,latitude,longitude,Map Data
1,0,1,18.89433,72.794102,Prongs Reef is a Natural;
2,0,2,18.89433,72.803607,United Services Club Golf Course is a Leisure ...
3,0,3,18.89433,72.813112,Indian Meterological Department is a Commercia...
13,0,13,18.89433,72.908163,Uran Naval Base is a Landuse: Military;
14,0,14,18.89433,72.917668,Uran Naval Base is a Landuse: Military;


In [15]:
## TF-IDF Vectorization

# Learn the vocabulary and IDF weights from the cleaned descriptions and
# encode each description as one row of the TF-IDF matrix X.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df_clean['Map Data'])

In [16]:
# Learned vocabulary: maps each token to its column index in the TF-IDF matrix.
vectorizer.vocabulary_

{'prongs': 5378,
 'reef': 5609,
 'is': 3129,
 'natural': 4614,
 'united': 7179,
 'services': 6144,
 'club': 1566,
 'golf': 2630,
 'course': 1676,
 'leisure': 3786,
 'entertainment': 2192,
 'defence': 1844,
 'station': 6603,
 'colaba': 1581,
 'landuse': 3746,
 'military': 4300,
 'kendriya': 3516,
 'vidyalaya': 7355,
 'no': 4776,
 'educational': 2118,
 'mumbai': 4483,
 'shoal': 6315,
 'indian': 3058,
 'meterological': 4263,
 'department': 1864,
 'commercial': 1602,
 'naval': 4624,
 'maritime': 4113,
 'academy': 206,
 'soma': 6502,
 'hospital': 2965,
 'healthcare': 2862,
 'agastya': 264,
 'building': 1221,
 'residential': 5660,
 'vatsala': 7289,
 'vishaka': 7409,
 'vandhana': 7259,
 'army': 583,
 'public': 5390,
 'school': 6096,
 'institute': 3101,
 'of': 4832,
 'geomagnetism': 2544,
 'inhs': 3083,
 'asvini': 653,
 'uran': 7197,
 'base': 886,
 'dumping': 2071,
 'ground': 2708,
 'landfill': 3743,
 'dongri': 2031,
 'funde': 2423,
 'jnpt': 3306,
 'township': 7041,
 'cidco': 1517,
 'nhava': 4

In [17]:
# Dimensions of the TF-IDF matrix: (number of documents, number of vocabulary terms).
X.shape

(791, 7726)

In [18]:
# Print the learned inverse-document-frequency weight of every vocabulary term.
for word, idx in vectorizer.vocabulary_.items():
    print(f'{word}: {vectorizer.idf_[idx]}')

prongs: 6.0651234793803255
reef: 5.477336814478207
is: 1.0
natural: 1.7529829721706105
united: 4.6300389540910025
services: 2.5625736034578823
club: 3.7233176732329984
golf: 4.966511190712216
course: 5.276666119016055
leisure: 1.4842459859612787
entertainment: 1.4842459859612787
defence: 5.7286512427591125
station: 2.670615085868967
colaba: 5.276666119016055
landuse: 1.8252366118675667
military: 3.6315101239798757
kendriya: 5.276666119016055
vidyalaya: 4.063643479170201
no: 3.531426665422893
educational: 1.9156596179371452
mumbai: 2.3132692261050005
shoal: 5.882801922586371
indian: 3.9133612761208636
meterological: 6.575949103146316
department: 5.59511985013459
commercial: 1.742316204366415
naval: 4.455685566946225
maritime: 5.882801922586371
academy: 4.378724525810097
soma: 6.575949103146316
hospital: 1.9707789171582248
healthcare: 1.8252366118675667
agastya: 5.882801922586371
building: 3.2086532731598423
residential: 1.547692207700241
vatsala: 6.575949103146316
vishaka: 6.98141421125

In [41]:
## clustering using HDBSCAN and UMAP

from sklearn.cluster import HDBSCAN, KMeans, DBSCAN
import umap.umap_ as umap
# from umap import UMAP

# NOTE: despite its name, `k_means` holds an HDBSCAN estimator, not KMeans.
k_means = HDBSCAN(min_cluster_size=50)
# The sparse TF-IDF matrix is converted to a dense array before fitting.
# `X_k` is whatever `fit` returns; later cells read its `labels_` attribute.
X_k = k_means.fit(X.toarray())

In [42]:
# Distinct cluster labels assigned by HDBSCAN; -1 is the label it gives to noise points.
set(X_k.labels_)

{-1}

In [39]:
# Project the TF-IDF vectors down to 3 dimensions for plotting.
# The reducer gets its own name instead of rebinding `umap`: reusing that name
# would shadow the imported module and make a re-run of this cell fail with
# AttributeError. A fixed random_state keeps the embedding reproducible.
umap_reducer = umap.UMAP(n_components=3, random_state=42)
X_umap = umap_reducer.fit_transform(X.toarray())

In [40]:
## plotting the 3d clusters

import plotly

import plotly.graph_objs as go

# Colour every point by its HDBSCAN label.
marker_style = dict(size=5, color=X_k.labels_, colorscale='Viridis', opacity=0.5)

# One marker per document, positioned by its three UMAP coordinates.
scatter_trace = go.Scatter3d(
    x=X_umap[:, 0],
    y=X_umap[:, 1],
    z=X_umap[:, 2],
    mode='markers',
    marker=marker_style,
)

fig = go.Figure(data=[scatter_trace])
fig.show()

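Conclusion: plain TF-IDF features do not work here (HDBSCAN labels every point as noise, -1). This approach won't be useful unless it is combined with some extensive feature engineering.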