uxlfoundation · lordoz234 · Dec 21, 2021 · Dec 21, 2021 · Dec 21, 2021 · Dec 22, 2021
@@ -62,8 +62,9 @@ Status KMeansBatchKernel<method, algorithmFPType, cpu>::compute(const NumericTab
     DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, p, sizeof(algorithmFPType));
 
     TArray<int, cpu> clusterS0(nClusters);
+    TArray<int, cpu> clusterS2(nClusters);
     TArray<algorithmFPType, cpu> clusterS1(nClusters * p);
-    DAAL_CHECK(clusterS0.get() && clusterS1.get(), services::ErrorMemoryAllocationFailed);
+    DAAL_CHECK(clusterS0.get() && clusterS2.get() && clusterS1.get(), services::ErrorMemoryAllocationFailed);
 
     /* Categorial variables check and support: begin */
     int catFlag = 0;
@@ -143,7 +144,7 @@ Status KMeansBatchKernel<method, algorithmFPType, cpu>::compute(const NumericTab
 
     for (kIter = 0; kIter < nIter; kIter++)
     {
-        auto task = TaskKMeansLloyd<algorithmFPType, cpu>::create(p, nClusters, inClusters, blockSize);
+        auto task = TaskKMeansLloyd<algorithmFPType, cpu>::create(p, nClusters, n, inClusters, blockSize);
         DAAL_CHECK(task.get(), services::ErrorMemoryAllocationFailed);
         {
             DAAL_ITTNOTIFY_SCOPED_TASK(addNTToTaskThreaded);
@@ -160,7 +161,7 @@ Status KMeansBatchKernel<method, algorithmFPType, cpu>::compute(const NumericTab
 
         {
             DAAL_ITTNOTIFY_SCOPED_TASK(kmeansPartialReduceCentroids);
-            task->template kmeansComputeCentroids<method>(clusterS0.get(), clusterS1.get(), dS1.get());
+            task->template kmeansComputeCentroids<method>(clusterS0.get(), clusterS2.get(), clusterS1.get(), dS1.get());
         }
 
         size_t cNum;
@@ -174,37 +175,49 @@ Status KMeansBatchKernel<method, algorithmFPType, cpu>::compute(const NumericTab
 
             for (size_t i = 0; i < nClusters; i++)
             {
-                if (clusterS0[i] > 0)
-                {
-                    const algorithmFPType coeff = 1.0 / clusterS0[i];
+                if (clusterS0[i] == 0) {
+                    DAAL_CHECK(cPos < cNum, services::ErrorKMeansNumberOfClustersIsTooLarge);
+                    newCentersGoalFunc += cValues[cPos];
+                    ReadRows<algorithmFPType, cpu> mtRow(ntData, cIndices[cPos], 1);
+                    const algorithmFPType * row = mtRow.get();
 
                     PRAGMA_IVDEP
                     PRAGMA_VECTOR_ALWAYS
                     for (size_t j = 0; j < p; j++)
                     {
-                        const algorithmFPType newCluster = clusterS1[i * p + j] * coeff;
-                        const algorithmFPType dist       = clusters[i * p + j] - newCluster;
+                        const algorithmFPType dist = clusters[i * p + j] - row[j];
                         l2Norm += dist * dist;
-                        clusters[i * p + j] = newCluster;
                     }
+
+                    int indexes = clusterS2[cPos];
+                    clusterS0[indexes]--;
+                    const algorithmFPType coeff = 1.0 / clusterS0[indexes];
+
+                    for (size_t j = 0; j < p; j++)
+                    {
+                        clusterS1[indexes * p + j] -= row[j];
+                    }
+                    result |=
+                        daal::services::internal::daal_memcpy_s(&clusters[i * p], p * sizeof(algorithmFPType), row, p * sizeof(algorithmFPType));
+                    cPos++;
                 }
-                else
+            }
+
+            for (size_t i = 0; i < nClusters; i++)
+            {
+                if (clusterS0[i] > 0)
                 {
-                    DAAL_CHECK(cPos < cNum, services::ErrorKMeansNumberOfClustersIsTooLarge);
-                    newCentersGoalFunc += cValues[cPos];
-                    ReadRows<algorithmFPType, cpu> mtRow(ntData, cIndices[cPos], 1);
-                    const algorithmFPType * row = mtRow.get();
+                    const algorithmFPType coeff = 1.0 / clusterS0[i];
 
                     PRAGMA_IVDEP
                     PRAGMA_VECTOR_ALWAYS
                     for (size_t j = 0; j < p; j++)
                     {
-                        const algorithmFPType dist = clusters[i * p + j] - row[j];
+                        const algorithmFPType newCluster = clusterS1[i * p + j] * coeff;
+                        const algorithmFPType dist       = clusters[i * p + j] - newCluster;
                         l2Norm += dist * dist;
+                        clusters[i * p + j] = newCluster;
                     }
-                    result |=
-                        daal::services::internal::daal_memcpy_s(&clusters[i * p], p * sizeof(algorithmFPType), row, p * sizeof(algorithmFPType));
-                    cPos++;
                 }
             }
         }

@@ -62,8 +62,11 @@ Status KMeansDistributedStep1Kernel<method, algorithmFPType, cpu>::compute(size_
     algorithmFPType * initClusters = const_cast<algorithmFPType *>(mtInitClusters.get());
     WriteOnlyRows<int, cpu> mtClusterS0(*const_cast<NumericTable *>(r[0]), 0, nClusters);
     DAAL_CHECK_BLOCK_STATUS(mtClusterS0);
+    WriteOnlyRows<int, cpu> mtClusterS2(*const_cast<NumericTable *>(r[0]), 0, n);
+    DAAL_CHECK_BLOCK_STATUS(mtClusterS2)
     /* TODO: That should be size_t or double */
     int * clusterS0 = mtClusterS0.get();
+    int * clusterS2 = mtClusterS2.get();
     WriteOnlyRows<algorithmFPType, cpu> mtClusterS1(*const_cast<NumericTable *>(r[1]), 0, nClusters);
     DAAL_CHECK_BLOCK_STATUS(mtClusterS1);
     algorithmFPType * clusterS1 = mtClusterS1.get();
@@ -116,7 +119,7 @@ Status KMeansDistributedStep1Kernel<method, algorithmFPType, cpu>::compute(size_
     Status s;
     algorithmFPType oldTargetFunc = (algorithmFPType)0.0;
     {
-        auto task = TaskKMeansLloyd<algorithmFPType, cpu>::create(p, nClusters, initClusters, blockSize);
+        auto task = TaskKMeansLloyd<algorithmFPType, cpu>::create(p, nClusters, n, initClusters, blockSize);
         DAAL_CHECK(task.get(), services::ErrorMemoryAllocationFailed);
         DAAL_ASSERT(task);
 
@@ -142,7 +145,7 @@ Status KMeansDistributedStep1Kernel<method, algorithmFPType, cpu>::compute(size_
             DAAL_CHECK(dS1.get(), services::ErrorMemoryAllocationFailed);
         }
 
-        task->template kmeansComputeCentroids<method>(clusterS0, clusterS1, dS1.get());
+        task->template kmeansComputeCentroids<method>(clusterS0, clusterS2, clusterS1, dS1.get());
 
         size_t cNum;
         DAAL_CHECK_STATUS(s, task->kmeansComputeCentroidsCandidates(cValues, cIndices.get(), cNum));

@@ -53,11 +53,12 @@ struct TlsTask
 {
     DAAL_NEW_DELETE();
 
-    TlsTask(int dim, int clNum, int maxBlockSize)
+    TlsTask(int dim, int clNum, int nSamples, int maxBlockSize) /* Allocates the work buffers of one TlsTask; nSamples sizes cS2. NOTE(review): create() forwards size_t arguments into these int parameters - confirm the values always fit in int */
     {
         mklBuff  = service_scalable_calloc<algorithmFPType, cpu>(maxBlockSize * clNum);
         cS1      = service_scalable_calloc<algorithmFPType, cpu>(clNum * dim);
         cS0      = service_scalable_calloc<int, cpu>(clNum);
+        cS2      = service_scalable_calloc<int, cpu>(nSamples); /* nSamples ints per TlsTask; create() deletes the task and returns nullptr when this pointer is null */
         cValues  = service_scalable_calloc<algorithmFPType, cpu>(clNum);
         cIndices = service_scalable_calloc<size_t, cpu>(clNum);
     }
@@ -76,6 +77,10 @@ struct TlsTask
         {
             service_scalable_free<int, cpu>(cS0);
         }
+        if (cS2)
+        {
+            service_scalable_free<int, cpu>(cS2);
+        }
         if (cValues)
         {
             service_scalable_free<algorithmFPType, cpu>(cValues);
@@ -86,14 +91,14 @@ struct TlsTask
         }
     }
 
-    static TlsTask<algorithmFPType, cpu> * create(const size_t dim, const size_t clNum, const size_t maxBlockSize)
+    static TlsTask<algorithmFPType, cpu> * create(const size_t dim, const size_t clNum, const size_t nSamples, const size_t maxBlockSize)
     {
-        TlsTask<algorithmFPType, cpu> * result = new TlsTask<algorithmFPType, cpu>(dim, clNum, maxBlockSize);
+        TlsTask<algorithmFPType, cpu> * result = new TlsTask<algorithmFPType, cpu>(dim, clNum, nSamples, maxBlockSize);
         if (!result)
         {
             return nullptr;
         }
-        if (!result->mklBuff || !result->cS1 || !result->cS0)
+        if (!result->mklBuff || !result->cS1 || !result->cS0 || !result->cS2)
         {
             delete result;
             return nullptr;
@@ -104,6 +109,7 @@ struct TlsTask
     algorithmFPType * mklBuff = nullptr;
     algorithmFPType * cS1     = nullptr;
     int * cS0                 = nullptr;
+    int * cS2                 = nullptr; /* nSamples-entry int buffer allocated in the constructor; addNTToTaskThreadedDense/CSR store minIdx into it and kmeansUpdatePoints reads it */
     algorithmFPType goalFunc  = 0.0;
     size_t cNum               = 0;
     algorithmFPType * cValues = nullptr;

@@ -51,15 +51,16 @@ struct TaskKMeansLloyd
 {
     DAAL_NEW_DELETE();
 
-    TaskKMeansLloyd(int _dim, int _clNum, algorithmFPType * _centroids, const size_t max_block_size)
+    TaskKMeansLloyd(int _dim, int _clNum, int _nSamples, algorithmFPType * _centroids, const size_t max_block_size)
     {
-        dim      = _dim;
-        clNum    = _clNum;
-        cCenters = _centroids;
+        dim              = _dim;
+        clNum            = _clNum;
+        nSamples         = _nSamples;
+        cCenters         = _centroids;
 
         /* Allocate memory for all arrays inside TLS */
         tls_task = new daal::static_tls<TlsTask<algorithmFPType, cpu> *>([=]() -> TlsTask<algorithmFPType, cpu> * {
-            return TlsTask<algorithmFPType, cpu>::create(dim, clNum, max_block_size);
+            return TlsTask<algorithmFPType, cpu>::create(dim, clNum, nSamples, max_block_size);
         }); /* Allocate memory for all arrays inside TLS: end */
 
         clSq = service_scalable_calloc<algorithmFPType, cpu>(clNum);
@@ -92,9 +93,9 @@ struct TaskKMeansLloyd
         }
     }
 
-    static SharedPtr<TaskKMeansLloyd<algorithmFPType, cpu> > create(int dim, int clNum, algorithmFPType * centroids, const size_t max_block_size)
+    static SharedPtr<TaskKMeansLloyd<algorithmFPType, cpu> > create(int dim, int clNum, int nSamples, algorithmFPType * centroids, const size_t max_block_size)
     {
-        SharedPtr<TaskKMeansLloyd<algorithmFPType, cpu> > result(new TaskKMeansLloyd<algorithmFPType, cpu>(dim, clNum, centroids, max_block_size));
+        SharedPtr<TaskKMeansLloyd<algorithmFPType, cpu> > result(new TaskKMeansLloyd<algorithmFPType, cpu>(dim, clNum, nSamples, centroids, max_block_size));
         if (result.get() && (!result->tls_task || !result->clSq))
         {
             result.reset();
@@ -115,8 +116,11 @@ struct TaskKMeansLloyd
     template <typename centroidsFPType>
     int kmeansUpdateCluster(int jidx, centroidsFPType * s1);
 
+    template <typename centroidsFPType>
+    int kmeansUpdatePoints(int jidx); /* returns the sum, over all TLS tasks, of cS2[cIndices[jidx]]; the definition never references centroidsFPType */
+
     template <Method method>
-    void kmeansComputeCentroids(int * clusterS0, algorithmFPType * clusterS1, double * auxData);
+    void kmeansComputeCentroids(int * clusterS0, int * clusterS2, algorithmFPType * clusterS1, double * auxData); /* clusterS2[i] is assigned kmeansUpdatePoints<double>(i) next to the clusterS0[i] update, so callers need at least as many clusterS2 entries as clusterS0 entries */
 
     void kmeansInsertCandidate(TlsTask<algorithmFPType, cpu> * tt, algorithmFPType value, size_t index);
 
@@ -130,6 +134,7 @@ struct TaskKMeansLloyd
 
     int dim;
     int clNum;
+    int nSamples; /* sample count received by the constructor; forwarded to TlsTask::create, where it sizes the cS2 buffer */
 
     typedef typename Fp2IntSize<algorithmFPType>::IntT algIntType;
 };
@@ -162,6 +167,7 @@ Status TaskKMeansLloyd<algorithmFPType, cpu>::addNTToTaskThreadedDense(const Num
         algorithmFPType * x_clusters = tt->mklBuff;
 
         int * cS0             = tt->cS0;
+        int * cS2             = tt->cS2;
         algorithmFPType * cS1 = tt->cS1;
 
         int * assignments = nullptr;
@@ -231,7 +237,9 @@ Status TaskKMeansLloyd<algorithmFPType, cpu>::addNTToTaskThreadedDense(const Num
             }
 
             kmeansInsertCandidate(tt, minGoalVal, k * blockSizeDefault + i);
+
             cS0[minIdx]++;
+            cS2[i] = minIdx;
 
             goal += minGoalVal;
 
@@ -281,6 +289,7 @@ Status TaskKMeansLloyd<algorithmFPType, cpu>::addNTToTaskThreadedCSR(const Numer
         algorithmFPType * x_clusters = tt->mklBuff;
 
         int * cS0             = tt->cS0;
+        int * cS2             = tt->cS2;
         algorithmFPType * cS1 = tt->cS1;
 
         int * assignments = nullptr;
@@ -332,6 +341,7 @@ Status TaskKMeansLloyd<algorithmFPType, cpu>::addNTToTaskThreadedCSR(const Numer
             *trg += minGoalVal;
 
             cS0[minIdx]++;
+            cS2[i] = minIdx;
 
             if (ntAssign)
             {
@@ -381,16 +391,30 @@ int TaskKMeansLloyd<algorithmFPType, cpu>::kmeansUpdateCluster(int jidx, centroi
     return s0;
 }
 
+template <typename algorithmFPType, CpuType cpu>
+template <typename centroidsFPType> /* NOTE(review): centroidsFPType is never referenced in the body; callers instantiate it with double */
+int TaskKMeansLloyd<algorithmFPType, cpu>::kmeansUpdatePoints(int jidx) /* Sums cS2[cIndices[jidx]] over every TLS task; kmeansComputeCentroids stores the result in clusterS2[jidx] */
+{
+    int idx = (int)jidx; /* jidx is already int, so this is a plain copy */
+    /* ji accumulates across TLS tasks; the lambda below captures it by reference */
+    int ji = 0;
+    /* NOTE(review): tt->cNum is not consulted, so a task with fewer than idx + 1 filled cIndices slots presumably still contributes an entry - confirm this is intended */
+    tls_task->reduce([&](TlsTask<algorithmFPType, cpu> * tt) -> void { ji += tt->cS2[tt->cIndices[idx]]; }); /* NOTE(review): cS2 is written as cS2[i] (block-local i) while kmeansInsertCandidate is passed k * blockSizeDefault + i - confirm cIndices and cS2 share one index space */
+    /* NOTE(review): the value returned is a sum of cS2 entries (stored minIdx values) across tasks - confirm a sum is the intended combination */
+    return ji;
+}
+
 template <typename algorithmFPType, CpuType cpu>
 template <Method method>
-void TaskKMeansLloyd<algorithmFPType, cpu>::kmeansComputeCentroids(int * clusterS0, algorithmFPType * clusterS1, double * auxData)
+void TaskKMeansLloyd<algorithmFPType, cpu>::kmeansComputeCentroids(int * clusterS0, int * clusterS2, algorithmFPType * clusterS1, double * auxData)
 {
     if (method == defaultDense && auxData)
     {
         for (size_t i = 0; i < clNum; i++)
         {
             service_memset_seq<double, cpu>(auxData, 0.0, dim);
             clusterS0[i] = kmeansUpdateCluster<double>(i, auxData);
+            clusterS2[i] = kmeansUpdatePoints<double>(i);
 
             PRAGMA_IVDEP
             PRAGMA_VECTOR_ALWAYS
@@ -406,6 +430,7 @@ void TaskKMeansLloyd<algorithmFPType, cpu>::kmeansComputeCentroids(int * cluster
         {
             service_memset_seq<algorithmFPType, cpu>(&clusterS1[i * dim], 0.0, dim);
             clusterS0[i] = kmeansUpdateCluster<algorithmFPType>(i, &clusterS1[i * dim]);
+            clusterS2[i] = kmeansUpdatePoints<double>(i);
         }
     }
 }