K-Means#
The K-Means algorithm solves clustering problem by partitioning
Operation |
Computational methods |
Programming Interface |
||
Mathematical formulation#
Training#
Given the training set
where
Expression
Note
In the general case,
Training method: Lloyd’s#
The Lloyd’s method [Lloyd82] consists in iterative updates of centroids by
applying the alternating Assignment and Update steps, where
(1) Assignment step: Assign each feature vector
Each feature vector from the training set
(2) Update step: Recalculate centroids by averaging feature vectors assigned to each cluster.
The steps (1) and (2) are performed until the following stop condition,
is satisfied or number of iterations exceeds the maximal value
Inference#
Given the inference set
Inference method: Lloyd’s#
Lloyd’s inference method computes the
Usage example#
Training#
kmeans::model<> run_training(const table& data,
const table& initial_centroids) {
const auto kmeans_desc = kmeans::descriptor<float>{}
.set_cluster_count(10)
.set_max_iteration_count(50)
.set_accuracy_threshold(1e-4);
const auto result = train(kmeans_desc, data, initial_centroids);
print_table("labels", result.get_labels());
print_table("centroids", result.get_model().get_centroids());
print_value("objective", result.get_objective_function_value());
return result.get_model();
}
Inference#
table run_inference(const kmeans::model<>& model,
const table& new_data) {
const auto kmeans_desc = kmeans::descriptor<float>{}
.set_cluster_count(model.get_cluster_count());
const auto result = infer(kmeans_desc, model, new_data);
print_table("labels", result.get_labels());
}
Programming Interface#
All types and functions in this section shall be declared in the
oneapi::dal::kmeans
namespace and be available via inclusion of the
oneapi/dal/algo/kmeans.hpp
header file.
Descriptor#
template <typename Float = float,
typename Method = method::by_default,
typename Task = task::by_default>
class descriptor {
public:
explicit descriptor(std::int64_t cluster_count = 2);
int64_t get_cluster_count() const;
descriptor& set_cluster_count(int64_t);
int64_t get_max_iteration_count() const;
descriptor& set_max_iteration_count(int64_t);
double get_accuracy_threshold() const;
descriptor& set_accuracy_threshold(double);
};
-
template<typename Float = float, typename Method = method::by_default, typename Task = task::by_default>
class descriptor# - Template Parameters:
Float – The floating-point type that the algorithm uses for intermediate computations. Can be float or double.
Method – Tag-type that specifies an implementation of algorithm. Can be method::lloyd.
Task – Tag-type that specifies the type of the problem to solve. Can be task::clustering.
Constructors
-
descriptor(std::int64_t cluster_count = 2)#
Creates a new instance of the class with the given
cluster_count
.
Properties
-
int64_t max_iteration_count#
The maximum number of iterations
. Default value: 100.- Getter & Setter
int64_t get_max_iteration_count() const
descriptor & set_max_iteration_count(int64_t)
- Invariants
- max_iteration_count >= 0
-
int64_t cluster_count#
The number of clusters
. Default value: 2.- Getter & Setter
int64_t get_cluster_count() const
descriptor & set_cluster_count(int64_t)
- Invariants
- cluster_count > 0
-
double accuracy_threshold#
The threshold
for the stop condition. Default value: 0.0.- Getter & Setter
double get_accuracy_threshold() const
descriptor & set_accuracy_threshold(double)
- Invariants
- accuracy_threshold >= 0.0
Model#
template <typename Task = task::by_default>
class model {
public:
model();
const table& get_centroids() const;
int64_t get_cluster_count() const;
};
-
template<typename Task = task::by_default>
class model# - Template Parameters:
Task – Tag-type that specifies type of the problem to solve. Can be task::clustering.
Constructors
-
model()#
Creates a new instance of the class with the default property values.
Public Methods
-
const table &get_centroids() const#
A
table with the cluster centroids. Each row of the table stores one centroid.
-
int64_t get_cluster_count() const#
Number of clusters
in the trained model.
Training train(...)#
Input#
template <typename Task = task::by_default>
class train_input {
public:
train_input(const table& data = table{},
const table& initial_centroids = table{});
const table& get_data() const;
train_input& set_data(const table&);
const table& get_initial_centroids() const;
train_input& set_initial_centroids(const table&);
};
-
template<typename Task = task::by_default>
class train_input# - Template Parameters:
Task – Tag-type that specifies type of the problem to solve. Can be task::clustering.
Constructors
-
train_input(const table &data = table{}, const table &initial_centroids = table{})#
Creates a new instance of the class with the given
data
andinitial_centroids
.
Properties
Result#
template <typename Task = task::by_default>
class train_result {
public:
train_result();
const model<Task>& get_model() const;
const table& get_labels() const;
int64_t get_iteration_count() const;
double get_objective_function_value() const;
};
-
template<typename Task = task::by_default>
class train_result# - Template Parameters:
Task – Tag-type that specifies type of the problem to solve. Can be task::clustering.
Constructors
-
train_result()#
Creates a new instance of the class with the default property values.
Public Methods
-
const table &get_labels() const#
An
table with the labels assigned to the samples in the input data, .
-
int64_t get_iteration_count() const#
The number of iterations performed by the algorithm.
Operation#
template <typename Float, typename Method, typename Task>
train_result<Task> train(const descriptor<Float, Method, Task>& desc,
const train_input<Task>& input);
-
template<typename Float, typename Method, typename Task>
train_result<Task> train(const descriptor<Float, Method, Task> &desc, const train_input<Task> &input)# Runs the training operation for K-Means clustering. For more details see oneapi::dal::train.
- Template Parameters:
Float – The floating-point type that the algorithm uses for intermediate computations. Can be float or double.
Method – Tag-type that specifies an implementation of algorithm. Can be method::lloyd.
Task – Tag-type that specifies type of the problem to solve. Can be task::clustering.
- Parameters:
desc – Descriptor of the algorithm.
input – Input data for the training operation.
- Preconditions
- Postconditions
- result.labels.row_count == input.data.row_countresult.labels.column_count == 1result.labels[i] >= 0result.labels[i] < desc.cluster_countresult.iteration_count <= desc.max_iteration_countresult.model.centroids.row_count == desc.cluster_countresult.model.centroids.column_count == input.data.column_count
Inference infer(...)#
Input#
template <typename Task = task::by_default>
class infer_input {
public:
infer_input(const model<Task>& m = model<Task>{},
const table& data = table{});
const model<Task>& get_model() const;
infer_input& set_model(const model<Task>&);
const table& get_data() const;
infer_input& set_data(const table&);
};
-
template<typename Task = task::by_default>
class infer_input# - Template Parameters:
Task – Tag-type that specifies type of the problem to solve. Can be task::clustering.
Constructors
-
infer_input(const model<Task> &m = model<Task>{}, const table &data = table{})#
Creates a new instance of the class with the given
model
anddata
.
Properties
Result#
template <typename Task = task::by_default>
class infer_result {
public:
infer_result();
const table& get_labels() const;
double get_objective_function_value() const;
};
-
template<typename Task = task::by_default>
class infer_result# - Template Parameters:
Task – Tag-type that specifies type of the problem to solve. Can be task::clustering.
Constructors
-
infer_result()#
Creates a new instance of the class with the default property values.
Public Methods
-
const table &get_labels() const#
An
table with assignments labels to feature vectors in the input data.
-
double get_objective_function_value() const#
The value of the objective function
, where is defined by the corresponding infer_input::model::centroids.
Operation#
template <typename Float, typename Method, typename Task>
infer_result<Task> infer(const descriptor<Float, Method, Task>& desc,
const infer_input<Task>& input);
-
template<typename Float, typename Method, typename Task>
infer_result<Task> infer(const descriptor<Float, Method, Task> &desc, const infer_input<Task> &input)# Runs the inference operation for K-Means clustering. For more details see oneapi::dal::infer.
- Template Parameters:
Float – The floating-point type that the algorithm uses for intermediate computations. Can be float or double.
Method – Tag-type that specifies an implementation of algorithm. Can be method::lloyd.
Task – Tag-type that specifies type of the problem to solve. Can be task::clustering.
- Parameters:
desc – Descriptor of the algorithm.
input – Input data for the inference operation.
- Preconditions
- Postconditions