Commit acdff0d: some first steps at understanding the paper / code

1 parent 2a03b23 · Showing 1 changed file with 215 additions and 0 deletions.
@@ -0,0 +1,215 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# import os\n",
"# os.environ['KERAS_BACKEND'] = 'tensorflow'"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from __future__ import print_function\n",
"import numpy as np\n",
"np.random.seed(0)\n",
"\n",
"from keras.datasets import mnist\n",
"from keras.models import Sequential\n",
"from keras.layers.core import Dense, Dropout, Activation\n",
"from keras.optimizers import SGD\n",
"from keras.utils import np_utils\n",
"from keras.objectives import categorical_crossentropy"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 1.19 s, sys: 429 ms, total: 1.62 s\n",
"Wall time: 1.64 s\n",
"60000 train samples\n",
"10000 test samples\n"
]
}
],
"source": [
"%time (X_train, y_train), (X_test, y_test) = mnist.load_data()\n",
"\n",
"X_train = X_train.reshape(60000, 784)\n",
"X_test = X_test.reshape(10000, 784)\n",
"X_train = X_train.astype('float32')\n",
"X_test = X_test.astype('float32')\n",
"X_train /= 255\n",
"X_test /= 255\n",
"print(X_train.shape[0], 'train samples')\n",
"print(X_test.shape[0], 'test samples')"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import theano\n",
"from theano.tensor.extra_ops import fill_diagonal\n",
"from theano import tensor as T\n",
"from keras import backend as K\n",
"\n",
"batch_size = 128\n",
"\n",
"# function [H, P] = Hbeta(D, beta)\n", | ||
"def Hbeta(D, beta):\n", | ||
" # P = exp(-D * beta);\n", | ||
" P = K.exp(-D * beta)\n", | ||
" # sumP = sum(P);\n", | ||
" sumP = K.sum(P)\n", | ||
" # H = log(sumP) + beta * sum(D .* P) / sumP;\n", | ||
" H = K.log(sumP) + beta * K.sum(K.prod(D, P)) / sumP\n", | ||
" # P = P / sumP;\n", | ||
" P = P / sumP\n", | ||
" return H, P\n", | ||
"\n", | ||
"# https://github.com/kylemcdonald/Parametric-t-SNE/blob/master/src/x2p.m\n", | ||
"def x2p(X, u=15, tol=1e-4):\n", | ||
" # n = size(X, 1); % number of instances\n", | ||
"# n = K.eval(K.shape(X)[0]) # this doesn't work: \"An input of the graph .. was not provided and not given a value\"\n", | ||
" n = batch_size\n", | ||
" # P = zeros(n, n); % empty probability matrix\n", | ||
" P = K.zeros((n, n))\n", | ||
" # beta = ones(n, 1); % empty precision vector\n", | ||
" beta = K.ones((n, 1))\n", | ||
" # logU = log(u); % log of perplexity (= entropy)\n", | ||
" logU = K.log(u)\n", | ||
" \n", | ||
" # sum_X = sum(X .^ 2, 2);\n", | ||
" sum_X = K.sum(K.square(X), axis=1)\n", | ||
" # D = bsxfun(@plus, sum_X, bsxfun(@plus, sum_X', -2 * X * X'));\n", | ||
" D = sum_X + (K.transpose(sum_X) + -2 * X * K.transpose(X))\n", | ||
" \n", | ||
" for i in range(n):\n", | ||
" # Di = D(i, [1:i-1 i+1:end]);\n", | ||
" Di = D[i] # can we use the whole row and make the diagonal zero later?\n", | ||
" # [H, thisP] = Hbeta(Di, beta(i));\n", | ||
" H, thisP = Hbeta(Di, beta[i])\n", | ||
" \n", | ||
" # ... a lot more right here\n", | ||
" \n", | ||
" P[i] = thisP\n", | ||
" \n", | ||
" return P #, beta\n", | ||
" \n", | ||
"# curX is the high-dimensional input (Keras loss functions call this y_true)\n", | ||
"# activations is the low-dimensional output (Keras loss functions call this y_pred)\n", | ||
"def tsne(curX, activations):\n", | ||
" perplexity = 30\n", | ||
" \n", | ||
" # these joint probabilities should be pre-computed per-batch and passed to the fit() function\n", | ||
" \n", | ||
" # P{i} = x2p(curX{i}, perplexity, 1e-5); % compute affinities using fixed perplexity\n", | ||
" P = x2p(curX, perplexity, 1e-5)\n", | ||
" # P{i}(isnan(P{i})) = 0; % make sure we don't have NaN's\n", | ||
" # P = T.set_subtensor(P[T.isnan(P)], 0) # something like this?\n", | ||
" # P = T.switch(T.isnan(P), 0, P) # or like this? \n", | ||
" # P{i} = (P{i} + P{i}') / 2; % make symmetric\n", | ||
" P = (P + K.transpose(P)) / 2 # this seems to be missing the step of normalizing by \"2n\", just normalizes by \"2\"\n", | ||
" # P{i} = P{i} ./ sum(P{i}(:)); % obtain estimation of joint probabilities\n", | ||
" P = P / K.sum(P) # but maybe this makes up for the missing \"n\" above?\n", | ||
" # P{i} = max(P{i}, eps);\n", | ||
" P = K.maximum(P, K.epsilon())\n", | ||
"\n", | ||
" # v = length(network{end}.bias_upW) - 1\n", | ||
" v = K.shape(activations)[1] - 1\n", | ||
" \n", | ||
" # sum_act = sum(activations .^ 2, 2)\n", | ||
" sum_act = K.sum(K.square(activations), axis=1)\n", | ||
" # Q = (1 + (bsxfun(@plus, sum_act, bsxfun(@plus, sum_act', -2 * activations * activations')) ./ v)) .^ -((v + 1) / 2)\n", | ||
" Q = K.pow(1 + ((sum_act + (K.transpose(sum_act) + -2 * activations * K.transpose(activations))) / v), -((v + 1) / 2))\n", | ||
" # Q(1:n+1:end) = 0\n", | ||
" fill_diagonal(Q, 0) # Theano-only\n", | ||
" # Q = Q ./ sum(Q(:))\n", | ||
" Q = K.maximum(Q, K.epsilon())\n", | ||
" \n", | ||
" # C = sum(sum(P{1} .* log((P{1} + eps) ./ (Q + eps))))\n", | ||
" C = K.sum(K.sum(K.prod(P, K.log((P + K.epsilon()) / (Q + K.epsilon())))))\n", | ||
" return C" | ||
] | ||
}, | ||
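{
"cell_type": "markdown",
"metadata": {},
"source": [
"A sketch of the \"pre-computed per-batch\" idea from the comment in `tsne()`: build the joint probabilities for each batch with plain NumPy up front and hand them to `fit()` as the targets, so the loss only has to form `Q` and the KL divergence. This is an illustration rather than the paper's code: the fixed `beta` (no per-point search for the target perplexity) and the `p_joint_numpy` name are assumptions made here for brevity."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def p_joint_numpy(batch, beta=1.0):\n",
"    # squared pairwise distances, same expansion as in x2p\n",
"    sum_X = np.sum(np.square(batch), axis=1)\n",
"    D = sum_X[:, None] + sum_X[None, :] - 2 * np.dot(batch, batch.T)\n",
"    P = np.exp(-D * beta)\n",
"    np.fill_diagonal(P, 0.)            # p_ii = 0\n",
"    P /= P.sum(axis=1, keepdims=True)  # conditionals p_j|i, one per row\n",
"    P = (P + P.T) / (2. * P.shape[0])  # symmetrize and normalize by 2n\n",
"    return np.maximum(P, 1e-12)\n",
"\n",
"# n = (X_train.shape[0] // batch_size) * batch_size\n",
"# P_train = np.concatenate([p_joint_numpy(X_train[i:i + batch_size])\n",
"#                           for i in range(0, n, batch_size)])\n",
"# the loss would then read P from y_true instead of calling x2p()"
]
},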
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"model = Sequential()\n",
"model.add(Dense(500, input_shape=(784,)))\n",
"model.add(Activation('relu'))\n",
"model.add(Dense(500))\n",
"model.add(Activation('relu'))\n",
"model.add(Dense(2000))\n",
"model.add(Activation('relu'))\n",
"model.add(Dense(2))\n",
"\n",
"sgd = SGD()\n",
"%time model.compile(loss=tsne, optimizer=sgd)"
]
},
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"collapsed": false | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"model.fit(X_train, Y_train,\n", | ||
" batch_size=batch_size,\n", | ||
" nb_epoch=20,\n", | ||
" verbose=2)" | ||
] | ||
} | ||
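{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# after training, the 2-D embedding is just the network's output\n",
"# (a usage sketch added for illustration, not part of this commit)\n",
"# embedding = model.predict(X_test)\n",
"# print(embedding.shape)  # (10000, 2)"
]
}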
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.10"
}
},
"nbformat": 4,
"nbformat_minor": 0
}