Skip to content

Commit

Permalink
Merge branch 'develop'
Browse files Browse the repository at this point in the history
  • Loading branch information
vmarkovtsev committed Feb 21, 2017
2 parents 6ec130a + 02e824a commit 0eb313d
Show file tree
Hide file tree
Showing 15 changed files with 860 additions and 74 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
.idea
cmake-build-*
**/*.cbp
**/CMakeCache.txt
**/CMakeFiles
**/.DS_Store
Expand Down
3 changes: 2 additions & 1 deletion .travis.linux
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
#!/bin/sh

rm -rf /opt/python
sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
wget http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1404/x86_64/cuda-repo-ubuntu1404_8.0.44-1_amd64.deb
sudo dpkg -i cuda-repo-ubuntu1404_8.0.44-1_amd64.deb
sudo apt-get update
sudo apt-get install -y --no-install-suggests --no-install-recommends g++-5 python3-dev python3-numpy cuda-cudart-dev-8-0 cuda-curand-dev-8-0 cuda-core-8-0 cuda-misc-headers-8-0
sudo apt-get install -y --no-install-suggests --no-install-recommends g++-5 python3-dev python3-numpy r-base-core cuda-cudart-dev-8-0 cuda-curand-dev-8-0 cuda-core-8-0 cuda-misc-headers-8-0
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-5 1 --slave /usr/bin/g++ g++ /usr/bin/g++-5
3 changes: 2 additions & 1 deletion .travis.osx
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
#!/bin/sh

brew install llvm --with-clang
brew install python3
brew tap homebrew/science
brew install python3 r
pip3 install numpy
brew cask update
brew cask install --verbose cuda
170 changes: 147 additions & 23 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ ball tree.

Technically, this project is a library which exports the two functions
defined in `kmcuda.h`: `kmeans_cuda` and `knn_cuda`.
It has the built-in Python3 native extension support, so you can
`from libKMCUDA import kmeans_cuda`.
It has the built-in Python3 and R native extension support, so you can
`from libKMCUDA import kmeans_cuda` or `dyn.load("libKMCUDA.so")`.

[![source{d}](img/sourced.png)](http://sourced.tech)
<p align="right"><a href="img/kmeans_image.ipynb">How this was created?</a></p>
Expand All @@ -33,16 +33,23 @@ Table of contents
* [macOS](#macos)
* [Testing](#testing)
* [Benchmarks](#benchmarks)
* [100000x256@1024](#100000x2561024)
* [100,000x256@1024](#100000x2561024)
* [Configuration](#configuration)
* [Contestants](#contestants)
* [Data](#data)
* [Notes](#notes-1)
* [8,000,000x256@1024](#8000000x2561024)
* [Data](#data-1)
* [Notes](#notes-2)
* [Python examples](#python-examples)
* [K-means, L2 (Euclidean) distance](#k-means-l2-euclidean-distance)
* [K-means, angular (cosine) distance average](#k-means-angular-cosine-distance--average)
* [K-means, angular (cosine) distance + average](#k-means-angular-cosine-distance--average)
* [K-nn](#k-nn-1)
* [Python API](#python-api)
* [R examples](#r-examples)
* [K-means](#k-means-1)
* [K-nn](#k-nn-2)
* [R API](#r-api)
* [C examples](#c-examples)
* [C API](#c-api)
* [License](#license)
Expand Down Expand Up @@ -123,6 +130,7 @@ It requires cudart 8.0 / Pascal and OpenMP 4.0 capable compiler. The build has
been tested primarily on Linux but it works on macOS too with some bells and whistles
(see "macOS" subsection).
If you do not want to build the Python native module, add `-D DISABLE_PYTHON=y`.
If you do not want to build the R native module, add `-D DISABLE_R=y`.
If CUDA is not automatically found, add `-D CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-8.0`
(change the path to the actual one). By default, CUDA kernels are compiled for
the architecture 60 (Pascal). It is possible to override it via `-D CUDA_ARCH=52`,
Expand Down Expand Up @@ -167,8 +175,6 @@ Benchmarks
----------

### 100000x256@1024
Comparison of some KMeans implementations:

| | sklearn KMeans | KMeansRex | KMeansRex OpenMP | Serban | kmcuda | kmcuda 2 GPU |
|------------|----------------|-----------|------------------|--------|--------|--------------|
| time, s | 164 | 36 | 20 | 10.6 | 9.2 | 5.5 |
Expand All @@ -193,6 +199,21 @@ Comparison of some KMeans implementations:
#### Notes
100000 is the maximum size Serban KMeans can handle.

### 8000000x256@1024
| | sklearn KMeans | KMeansRex | KMeansRex OpenMP | Serban | kmcuda 2 GPU | kmcuda Yinyang 2 GPU |
|------------|----------------|-----------|------------------|--------|--------------|----------------------|
| time | please no | - | 6h 34m | fail | 44m | 36m |
| memory, GB | - | - | 205 | fail | 8.7 | 10.4 |

kmeans++ initialization, 93 iterations (1% reassignments equivalent).

#### Data
8,000,000 secret production samples.

#### Notes
KMeansRex consumed 205 GB of RAM at peak; it uses dynamic memory, so its usage
constantly bounced between 100 GB and 200 GB.

Python examples
---------------

Expand Down Expand Up @@ -276,7 +297,7 @@ calculated 0.276552 of all the distances
Python API
----------
```python
def kmeans_cuda(samples, clusters, tolerance=0.0, init="k-means++",
def kmeans_cuda(samples, clusters, tolerance=0.01, init="k-means++",
yinyang_t=0.1, metric="L2", average_distance=False,
seed=time(), device=0, verbosity=0)
```
Expand All @@ -289,18 +310,20 @@ def kmeans_cuda(samples, clusters, tolerance=0.0, init="k-means++",

**clusters** integer, the number of clusters.

**tolerance** float, if the relative number of reassignments drops below this value, stop.
**tolerance** float, if the relative number of reassignments drops below this value,
algorithm stops.

**init** string or numpy array, sets the method for centroids initialization,
may be "k=means++"/"kmeans++", "random" or numpy array of shape
may be "k-means++", "afk-mc2", "random" or numpy array of shape
\[**clusters**, number of features\]. dtype must be float32.

**yinyang_t** float, the relative number of cluster groups, usually 0.1.
0 disables Yinyang refinement.

**metric** str, the name of the distance metric to use. The default is Euclidean (L2),
can be changed to "cos" to behave as Spherical K-means with the
angular distance. Please note that samples *must* be normalized in that
case.
it can be changed to "cos" to change the algorithm to Spherical K-means
with the angular distance. Please note that samples *must* be normalized
in the latter case.

**average_distance** boolean, the value indicating whether to calculate
the average distance between cluster elements and
Expand All @@ -309,17 +332,18 @@ def kmeans_cuda(samples, clusters, tolerance=0.0, init="k-means++",

**seed** integer, random generator seed for reproducible results.

**device** integer, bitwise OR-ed CUDA device indices, e.g. 1 means first device, 2 means second device,
3 means using first and second device. Special value 0 enables all available devices.
The default is 0.
**device** integer, bitwise OR-ed CUDA device indices, e.g. 1 means first device,
2 means second device, 3 means using first and second device. Special
value 0 enables all available devices. The default is 0.

**verbosity** integer, 0 means complete silence, 1 means mere progress logging,
2 means lots of output.

**return** tuple(centroids, assignments). If **samples** was a numpy array or
a host pointer tuple, the types are numpy arrays, otherwise, raw pointers
(integers) allocated on the same device. If **samples** are float16,
the returned centroids are float16 too.
**return** tuple(centroids, assignments, \[average_distance\]).
If **samples** was a numpy array or a host pointer tuple, the types
are numpy arrays, otherwise, raw pointers (integers) allocated on the
same device. If **samples** are float16, the returned centroids are
float16 too.

```python
def knn_cuda(k, samples, centroids, assignments, metric="L2", device=0, verbosity=0)
Expand All @@ -342,6 +366,108 @@ def knn_cuda(k, samples, centroids, assignments, metric="L2", device=0, verbosit
to be compatible with uint32. If **samples** is a tuple then
**assignments** is a pointer. The shape is (number of samples,).

**metric** str, the name of the distance metric to use. The default is Euclidean (L2),
it can be changed to "cos" to change the algorithm to Spherical K-means
with the angular distance. Please note that samples *must* be normalized
in the latter case.

**device** integer, bitwise OR-ed CUDA device indices, e.g. 1 means first device,
2 means second device, 3 means using first and second device. Special
value 0 enables all available devices. The default is 0.

**verbosity** integer, 0 means complete silence, 1 means mere progress logging,
2 means lots of output.

**return** neighbor indices. If **samples** was a numpy array or
a host pointer tuple, the return type is numpy array, otherwise, a
raw pointer (integer) allocated on the same device. The shape is
(number of samples, k).

R examples
----------
#### K-means
```R
dyn.load("libKMCUDA.so")
samples = replicate(4, runif(16000))
result = .External("kmeans_cuda", samples, 50, tolerance=0.01,
seed=777, verbosity=1, average_distance=TRUE)
print(result$average_distance)
print(result$centroids[1:10,])
print(result$assignments[1:10])
```

#### K-nn
```R
dyn.load("libKMCUDA.so")
samples = replicate(4, runif(16000))
cls = .External("kmeans_cuda", samples, 50, tolerance=0.01,
seed=777, verbosity=1)
result = .External("knn_cuda", 20, samples, cls$centroids, cls$assignments,
verbosity=1)
print(result[1:10,])
```

R API
-----
```R
function kmeans_cuda(
samples, clusters, tolerance=0.01, init="k-means++", yinyang_t=0.1,
metric="L2", average_distance=FALSE, seed=Sys.time(), device=0, verbosity=0)
```
**samples** real matrix of shape \[number of samples, number of features\]
or list of real matrices which are rbind()-ed internally. No more
than INT32_MAX samples and UINT16_MAX features are supported.

**clusters** integer, the number of clusters.

**tolerance** real, if the relative number of reassignments drops below this value,
algorithm stops.

**init** character vector or real matrix, sets the method for centroids initialization,
may be "k-means++", "afk-mc2", "random" or a real matrix of shape
\[**clusters**, number of features\].

**yinyang_t** real, the relative number of cluster groups, usually 0.1.
0 disables Yinyang refinement.

**metric** character vector, the name of the distance metric to use. The default
is Euclidean (L2), it can be changed to "cos" to change the algorithm
to Spherical K-means with the angular distance. Please note that
samples *must* be normalized in the latter case.

**average_distance** logical, the value indicating whether to calculate
the average distance between cluster elements and
the corresponding centroids. Useful for finding
the best K. Returned as the third list element.

**seed** integer, random generator seed for reproducible results.

**device** integer, bitwise OR-ed CUDA device indices, e.g. 1 means first device,
2 means second device, 3 means using first and second device. Special
value 0 enables all available devices. The default is 0.

**verbosity** integer, 0 means complete silence, 1 means mere progress logging,
2 means lots of output.

**return** list(centroids, assignments\[, average_distance\]). Indices in
assignments start from 1.

```R
function knn_cuda(k, samples, centroids, assignments, metric="L2", device=0, verbosity=0)
```
**k** integer, the number of neighbors to search for each sample. Must be ≤ 2<sup>16</sup>.

**samples** real matrix of shape \[number of samples, number of features\]
or list of real matrices which are rbind()-ed internally.
In the latter case, it is possible to pass in more than INT32_MAX
samples.

**centroids** real matrix with precalculated clusters' centroids (e.g., using
kmeans() or kmeans_cuda()).

**assignments** integer vector with sample-cluster associations. Indices start
from 1.

**metric** str, the name of the distance metric to use. The default is Euclidean (L2),
can be changed to "cos" to behave as Spherical K-means with the
angular distance. Please note that samples *must* be normalized in that
Expand All @@ -354,10 +480,8 @@ def knn_cuda(k, samples, centroids, assignments, metric="L2", device=0, verbosit
**verbosity** integer, 0 means complete silence, 1 means mere progress logging,
2 means lots of output.

**return** neighbor indices. If **samples** was a numpy array or
a host pointer tuple, the return type is numpy array, otherwise, a
raw pointer (integer) allocated on the same device. The shape is
(number of samples, k).
**return** integer matrix with neighbor indices. The shape is (number of samples, k).
Indices start from 1.

C examples
----------
Expand Down
16 changes: 13 additions & 3 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
cmake_minimum_required(VERSION 3.2)
project(KMCUDA)
set(CMAKE_MODULE_PATH ${CMAKE_HOME_DIRECTORY}/cmake)
find_package(OpenMP REQUIRED)
if (APPLE AND NOT CUDA_HOST_COMPILER)
# https://gitlab.kitware.com/cmake/cmake/issues/13674
Expand All @@ -24,7 +25,9 @@ if (NOT DISABLE_PYTHON)
execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import numpy; print(numpy.get_include())" OUTPUT_VARIABLE NUMPY_INCLUDES)
endif()
endif()

# Look for an R installation unless the user opted out with -D DISABLE_R=y.
# The lookup is not REQUIRED: when R is absent, R_FOUND is false and the
# R-specific steps guarded by R_FOUND below are skipped.
if (NOT DISABLE_R)
find_package(R)
endif()
if (PROFILE OR CMAKE_BUILD_TYPE STREQUAL "Debug")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DPROFILE")
endif()
Expand All @@ -35,9 +38,12 @@ endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native -Wall -Werror -DCUDA_ARCH=${CUDA_ARCH} -std=c++11 ${OpenMP_CXX_FLAGS}")
set(SOURCE_FILES kmcuda.cc kmcuda.h wrappers.h private.h fp_abstraction.h tricks.cuh
metric_abstraction.h kmeans.cu knn.cu transpose.cu)
if (NOT DISABLE_PYTHON)
if (PYTHONLIBS_FOUND)
list(APPEND SOURCE_FILES python.cc)
endif()
# Add the R binding source to the library only when find_package(R) succeeded.
if (R_FOUND)
list(APPEND SOURCE_FILES r.cc)
endif()
if (CMAKE_BUILD_TYPE STREQUAL "Debug")
set(NVCC_FLAGS "-G -g")
endif()
Expand All @@ -59,10 +65,14 @@ if (APPLE)
set(CMAKE_SHARED_LIBRARY_CXX_FLAGS "${CMAKE_SHARED_LIBRARY_CXX_FLAGS_BACKUP}")
endif()
target_link_libraries(KMCUDA ${CUDA_curand_LIBRARY})
if(PYTHONLIBS_FOUND)
if (PYTHONLIBS_FOUND)
include_directories(${PYTHON_INCLUDE_DIRS} ${NUMPY_INCLUDES})
target_link_libraries(KMCUDA ${PYTHON_LIBRARIES})
endif()
# When R was located, add its header directories to the include path and link
# KMCUDA against the R libraries reported by the FindR module.
if (R_FOUND)
include_directories(${R_INCLUDE_DIRS})
target_link_libraries(KMCUDA ${R_LIBRARIES})
endif()
if (SUFFIX)
set_target_properties(KMCUDA PROPERTIES SUFFIX ${SUFFIX})
endif()
55 changes: 55 additions & 0 deletions src/cmake/FindR.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# CMake module to find R
# - Try to find R
# Once done, this will define
#
#  R_FOUND        - system has R
#  R_INCLUDE_DIRS - the R include directories
#  R_LIBRARIES    - link these to use R
#  R_ROOT_DIR     - As reported by R
#
# Optional COMPONENTS are treated as R package names: for each one, the
# directory reported by R's find.package() is searched for lib<name>.so /
# lib<name>.dylib and <name>.h, and whatever is found is appended to
# R_LIBRARIES / R_INCLUDE_DIRS.
#
# Author: Omar Andres Zapata Mesa 31/05/2013

# On macOS, consider application bundles only after regular locations.
# The variable name is passed unexpanded so an empty CMAKE_SYSTEM_NAME cannot
# turn the condition into a syntax error.
if(CMAKE_SYSTEM_NAME MATCHES "Darwin")
  set(CMAKE_FIND_APPBUNDLE "LAST")
endif()

find_program(R_EXECUTABLE NAMES R R.exe)

#---searching the R installation using the R executable
if(R_EXECUTABLE)
  # "R RHOME" prints the root directory of the R installation.
  execute_process(COMMAND "${R_EXECUTABLE}" RHOME
                  OUTPUT_VARIABLE R_ROOT_DIR
                  OUTPUT_STRIP_TRAILING_WHITESPACE)
  find_path(R_INCLUDE_DIR R.h
            HINTS ${R_ROOT_DIR}
            PATHS /usr/local/lib /usr/local/lib64 /usr/share
            PATH_SUFFIXES include R/include
            DOC "Path to file R.h")
  find_library(R_LIBRARY R
               HINTS ${R_ROOT_DIR}/lib
               DOC "R library (example libR.a, libR.dylib, etc.).")
endif()

#---setting include dirs and libraries
set(R_LIBRARIES ${R_LIBRARY})
set(R_INCLUDE_DIRS ${R_INCLUDE_DIR})

# Components can only be resolved by asking R itself, so skip them entirely
# when no R executable is available instead of trying to run a NOTFOUND path.
if(R_EXECUTABLE)
  foreach(_cpt ${R_FIND_COMPONENTS})
    # Pass the expression with -e rather than piping it from `echo`: `echo` is
    # a shell builtin (not an executable) on Windows, so the pipeline could not
    # be started there.
    execute_process(COMMAND "${R_EXECUTABLE}" --vanilla --slave
                            -e "cat(find.package('${_cpt}'))"
                    OUTPUT_VARIABLE _cpt_path
                    OUTPUT_STRIP_TRAILING_WHITESPACE)

    find_library(R_${_cpt}_LIBRARY
                 lib${_cpt}.so lib${_cpt}.dylib
                 HINTS ${_cpt_path}/lib)
    if(R_${_cpt}_LIBRARY)
      mark_as_advanced(R_${_cpt}_LIBRARY)
      list(APPEND R_LIBRARIES ${R_${_cpt}_LIBRARY})
    endif()

    find_path(R_${_cpt}_INCLUDE_DIR ${_cpt}.h
              HINTS ${_cpt_path}
              PATH_SUFFIXES include R/include)
    if(R_${_cpt}_INCLUDE_DIR)
      mark_as_advanced(R_${_cpt}_INCLUDE_DIR)
      list(APPEND R_INCLUDE_DIRS ${R_${_cpt}_INCLUDE_DIR})
    endif()

    # Fully resolved components are dropped from the pending list.
    if(R_${_cpt}_INCLUDE_DIR AND R_${_cpt}_LIBRARY)
      list(REMOVE_ITEM R_FIND_COMPONENTS ${_cpt})
    endif()
  endforeach()
endif()

# Handle the QUIETLY and REQUIRED arguments and set R_FOUND to TRUE if all listed variables are TRUE
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(R DEFAULT_MSG R_EXECUTABLE R_INCLUDE_DIR R_LIBRARY)

# Only cache entries can be marked as advanced. R_FOUND is a normal variable
# set by find_package_handle_standard_args, so it is intentionally not listed:
# on older CMake, listing it would create an empty R_FOUND cache entry.
mark_as_advanced(R_EXECUTABLE R_INCLUDE_DIR R_LIBRARY)
Loading

0 comments on commit 0eb313d

Please sign in to comment.