# machine-learning/homework_03_kmeans/homework/3-3.py

'''
Author: SJ2050
Date: 2021-11-07 15:13:40
LastEditTime: 2021-11-07 21:14:18
Version: v0.0.1
Description: Use a density-based clustering algorithm.
Copyright © 2021 SJ2050
'''
import random
from collections import deque

import numpy as np

from basic_kMeans import read_csv_data, show_figure

def compute_distances(points):
    """Compute the Euclidean distance between every pair of points.

    Args:
        points: Sequence of n points. Each point may be a scalar or an
            array of coordinates; all points must have the same shape.

    Returns:
        distances: An (n, n) float64 numpy array where ``distances[i, j]``
            is the Euclidean distance between point i and point j. The
            matrix is symmetric with a zero diagonal. An empty input
            yields an array of shape (0, 0).

    """
    points = np.asarray(points, dtype=np.float64)
    n = len(points)
    if n == 0:
        return np.zeros((0, 0))

    # Flatten every point to a coordinate vector so scalar points and
    # multi-dimensional points are handled by the same expression.
    flat = points.reshape(n, points.size // n)

    # Broadcasting builds all pairwise differences at once, replacing the
    # n*n Python-level loop. The intermediate has shape (n, n, d).
    diff = flat[:, np.newaxis, :] - flat[np.newaxis, :, :]
    return np.linalg.norm(diff, axis=-1)

def find_neighbors(distances, p_ind, eps):
    """Collect the indices of all points lying strictly within eps of a point.

    A point j is a neighbor of the reference point when their distance is
    strictly less than eps. The reference point is never its own neighbor.

    Args:
        distances: Pairwise distance matrix (numpy 2d array).
        p_ind: Index of the reference point.
        eps: Neighborhood radius; the comparison is strict (``<``).

    Returns:
        neighbors: A set with the indices of the reference point's neighbors.
    """
    neighbors = set()
    for candidate in range(distances.shape[0]):
        if candidate == p_ind:
            # A point does not count as its own neighbor.
            continue
        if distances[p_ind][candidate] < eps:
            neighbors.add(candidate)

    return neighbors

def DBSCAN(points, eps, min_Pts):
    """Cluster points with a density-based algorithm (DBSCAN).

    Args:
        points: All points.
        eps: Two points are neighbors when their distance is strictly
            less than eps.
        min_Pts: A point is a core object when it has at least `min_Pts`
            neighbors (the point itself is not counted).

    Returns:
        clusters: A list of sets of point indices. Every set except the
            last is one cluster grown from a core object. The last set
            holds the indices that were never reached from any core
            object (noise); it is always present and may be empty.
    """
    # initialization
    n_points = len(points)
    distances = compute_distances(points)
    each_point_neighbors = [find_neighbors(distances, i, eps)
                            for i in range(n_points)]
    core_objects = {i for i, neighbors in enumerate(each_point_neighbors)
                    if len(neighbors) >= min_Pts}
    unvisited_indices = set(range(n_points))

    clusters = []
    while core_objects:
        # Every remaining core object is still unvisited: visited ones are
        # removed together with their cluster at the end of each round.
        seed = random.choice(list(core_objects))
        cluster = {seed}
        unvisited_indices.discard(seed)

        # Breadth-first expansion; deque gives O(1) pops from the left,
        # whereas list.pop(0) shifts the whole list on every dequeue.
        queue = deque([seed])
        while queue:
            q = queue.popleft()
            neighbors = each_point_neighbors[q]
            if len(neighbors) < min_Pts:
                # Border point: it joins the cluster but does not expand it.
                continue
            reachable = neighbors & unvisited_indices
            queue.extend(reachable)
            cluster |= reachable
            unvisited_indices -= reachable

        clusters.append(cluster)
        core_objects -= cluster

    # Whatever was never reached from a core object is reported last.
    clusters.append(unvisited_indices)
    return clusters

if __name__ == '__main__':
    # Load the data set and keep only the first two columns as coordinates.
    data_file = 'dataset_circles.csv'
    original_data = read_csv_data(data_file)
    points = original_data[:, :2]

    # Clustering parameters.
    eps = 5
    min_Pts = 5

    clusters = DBSCAN(points, eps, min_Pts)

    # Tag every point with the position of its cluster in the result list.
    classified_points = []
    for label, members in enumerate(clusters):
        for idx in members:
            classified_points.append((points[idx][0], points[idx][1], label))

    show_figure(classified_points, [])