mirror of
https://github.com/fenago/data-science.git
synced 2026-05-05 00:51:50 +00:00
59 lines
3.3 KiB
Python
59 lines
3.3 KiB
Python
import unittest
|
|
import import_ipynb
|
|
import pandas as pd
|
|
import pandas.testing as pd_testing
|
|
from sklearn.cluster import KMeans
|
|
import random
|
|
|
|
class Test(unittest.TestCase):
    """Compare values exposed by the ``Exercise5_4`` module with a reference run."""

    def setUp(self):
        """Import the exercise module and rebuild the expected values.

        Reads three columns of the tax statistics CSV, draws four seeded
        random centroids inside the observed income/expenses ranges, and
        labels each of the first five rows with the index of its closest
        centroid (squared Euclidean distance).
        """
        # Function-scope import: the module is resolved each time setUp runs.
        import Exercise5_4
        self.exercises = Exercise5_4

        income_col = 'Average total business income'
        expenses_col = 'Average total business expenses'
        feature_cols = [income_col, expenses_col]

        self.file_url = '../DataSet/taxstats2015.csv'
        self.df = pd.read_csv(self.file_url, usecols=['Postcode', *feature_cols])
        self.X = self.df[feature_cols]

        income_values = self.df[income_col]
        expenses_values = self.df[expenses_col]
        self.business_income_min = income_values.min()
        self.business_income_max = income_values.max()
        self.business_expenses_min = expenses_values.min()
        self.business_expenses_max = expenses_values.max()

        # Seeded draws: income is sampled before expenses, and that order
        # decides which random values each column receives.
        random.seed(42)
        self.centroids = pd.DataFrame()
        income_range = range(self.business_income_min, self.business_income_max)
        self.centroids[income_col] = random.sample(income_range, 4)
        expenses_range = range(self.business_expenses_min, self.business_expenses_max)
        self.centroids[expenses_col] = random.sample(expenses_range, 4)
        self.centroids['cluster'] = self.centroids.index

        def distances_to_centroids(point_x, point_y):
            """Return the squared Euclidean distance to each of the four centroids."""
            return [
                (point_x - self.centroids.at[k, income_col]) ** 2
                + (point_y - self.centroids.at[k, expenses_col]) ** 2
                for k in range(4)
            ]

        def nearest(values):
            """Return the position of the first smallest entry in ``values``."""
            return values.index(min(values))

        # Row 0 is handled on its own because its coordinates and its cluster
        # index are also kept as attributes.
        self.data_x = self.df.at[0, income_col]
        self.data_y = self.df.at[0, expenses_col]
        self.distances = distances_to_centroids(self.data_x, self.data_y)
        self.cluster_index = nearest(self.distances)
        self.df.at[0, 'cluster'] = self.cluster_index

        # Rows 1-4 follow the same recipe; self.distances ends up holding the
        # distances computed for row 4.
        for row in range(1, 5):
            self.distances = distances_to_centroids(
                self.df.at[row, income_col],
                self.df.at[row, expenses_col],
            )
            self.df.at[row, 'cluster'] = nearest(self.distances)

    def test_file_url(self):
        # The exercise must reference the same CSV path as the reference run.
        actual = self.exercises.file_url
        self.assertEqual(actual, self.file_url)

    def test_df(self):
        # The exercise DataFrame must equal the reference frame, including
        # the 'cluster' labels written for the first five rows.
        actual = self.exercises.df
        pd_testing.assert_frame_equal(actual, self.df)

    def test_distances(self):
        # Same distance values with the same multiplicities, in any order.
        actual = self.exercises.distances
        self.assertCountEqual(actual, self.distances)
|
|
|
|
|
|
# Run the test suite when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()
|