From 448a3be1d2e2ea97638ef7d1680eab96b9ead3ad Mon Sep 17 00:00:00 2001
From: Mike Sconzo
Date: Fri, 15 Aug 2014 13:34:44 -0500
Subject: [PATCH] initial import of automagic scripts

---
 clusterfck/clusterfck_macho  | 487 +++++++++++++++++++++++++++++++++++
 clusterfck/clusterfck_pefile | 431 +++++++++++++++++++++++++++++++
 2 files changed, 918 insertions(+)
 create mode 100755 clusterfck/clusterfck_macho
 create mode 100755 clusterfck/clusterfck_pefile

diff --git a/clusterfck/clusterfck_macho b/clusterfck/clusterfck_macho
new file mode 100755
index 0000000..5f10cc9
--- /dev/null
+++ b/clusterfck/clusterfck_macho
@@ -0,0 +1,487 @@
+#!/usr/bin/python
+"""
+This script parses a group of Mach-O files, retrieves a subset of features/attributes from each file,
+performs various clustering algorithms on the features, and then, for the files in each cluster,
+generates Yara signatures that can help identify similar files.
+"""
+
+import os
+import sys
+import math
+import glob
+import macholib.MachO
+import collections
+import numpy as np
+import pandas as pd
+from argparse import ArgumentParser
+import data_hacking
+import matplotlib.pyplot as plt
+from mpl_toolkits.mplot3d import Axes3D
+from sklearn.preprocessing import scale
+from sklearn.decomposition import PCA
+
+include_cmd_size = False
+
+def save_graphic(df, cols, signature_directory, cluster_type):
+    """
+    Save a simple 3D graph of the cluster layout after using PCA to reduce the clusters to
+    three dimensions.
+    """
+    print "Generating graphic of cluster representation"
+    X = df.as_matrix(cols)
+    X = scale(X)
+    X = PCA(n_components=3).fit_transform(X)
+    fig = plt.figure(figsize=plt.figaspect(1))
+    ax = fig.add_subplot(1, 1, 1, projection='3d')
+    ax.scatter(X[:,0], X[:,1], X[:,2], c=df['cluster'], s=50)
+    ax.set_title(cluster_type + " Clusters")
+    fig.set_size_inches(24,18)
+    fig.set_dpi(400)
+    fig.savefig(os.path.join(signature_directory, cluster_type + '.png'), bbox_inches='tight')
+
+def e_features(filename):
+    """
+    Extract the features from the given filename.
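+
+    Returns a list with one feature dict per architecture header; a sketch of one entry
+    (illustrative values only):
+
+        {'filename': 'sample', 'number of architectures': 1,
+         'LC_SEGMENT_64': 4, 'lc_symtab_nsyms': 278, ...}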
+
+    Uses macholib to do the file parsing; only a subset of features is taken from the parsed file.
+    """
+    results = []
+    try:
+        features = {}
+        mfile = macholib.MachO.MachO(filename)
+        if mfile.fat:
+            features['number of architectures'] = mfile.fat.nfat_arch
+        else:
+            features['number of architectures'] = 1
+
+        for header in mfile.headers:
+            cmd_count_size = collections.Counter()
+            cmd_count = collections.Counter()
+            #features['magic'] = int(header.MH_MAGIC)
+            #features['h_size'] = int(header.size)
+            #features['h_offset'] = int(header.offset)
+            for cmd in header.commands:
+                load_cmd = cmd[0]
+                cmd_info = cmd[1]
+                cmd_data = cmd[2]
+                cmd_name = load_cmd.get_cmd_name()
+                try:
+                    cmd_count[cmd_name] += 1
+                    cmd_count_size[cmd_name + ":" + str(load_cmd.cmdsize)] += 1
+                    if cmd_name in ('LC_SEGMENT', 'LC_SEGMENT_64'):
+                        bits = ''
+                        if cmd_name == 'LC_SEGMENT_64':
+                            bits = "64"
+                        for section_data in cmd_data:
+                            sec_info = section_data.describe()
+                            if sec_info['segname'] == '__PAGEZERO':
+                                features['lc_segment_' + bits + '_vmaddr'] = sec_info['vmaddr']
+                                features['lc_segment_' + bits + '_vmsize'] = sec_info['vmsize']
+                                features['lc_segment_' + bits + '_filesize'] = sec_info['filesize']
+                                features['lc_segment_' + bits + '_fileoff'] = sec_info['fileoff']
+                    if cmd_name == 'LC_VERSION_MIN_MACOSX':
+                        features['lc_version_min_macosx_min_version'] = float('.'.join(cmd_info.describe()['version'].split('.')[:2]))
+                    if cmd_name == 'LC_SYMTAB':
+                        info = cmd_info.describe()
+                        features['lc_symtab_strsize'] = info['strsize']
+                        features['lc_symtab_stroff'] = info['stroff']
+                        features['lc_symtab_symoff'] = info['symoff']
+                        features['lc_symtab_nsyms'] = info['nsyms']
+                    if cmd_name in ('LC_DYLD_INFO_ONLY', 'LC_DYLD_INFO'):
+                        info = cmd_info.describe()
+                        features['lc_dyld_info_lazy_bind_size'] = info['lazy_bind_size']
+                        features['lc_dyld_info_rebase_size'] = info['rebase_size']
+                        features['lc_dyld_info_lazy_bind_off'] = info['lazy_bind_off']
+                        features['lc_dyld_info_export_off'] = info['export_off']
+                        features['lc_dyld_info_export_size'] = info['export_size']
+                        features['lc_dyld_info_bind_off'] = info['bind_off']
+                        features['lc_dyld_info_rebase_off'] = info['rebase_off']
+                        features['lc_dyld_info_bind_size'] = info['bind_size']
+                        features['lc_dyld_info_weak_bind_size'] = info['weak_bind_size']
+                        features['lc_dyld_info_weak_bind_off'] = info['weak_bind_off']
+                    if cmd_name == 'LC_DYSYMTAB':
+                        info = cmd_info.describe()
+                        features['lc_dysymtab_nextdefsym'] = info['nextdefsym']
+                        features['lc_dysymtab_extreloff'] = info['extreloff']
+                        features['lc_dysymtab_nlocrel'] = info['nlocrel']
+                        features['lc_dysymtab_modtaboff'] = info['modtaboff']
+                        features['lc_dysymtab_iundefsym'] = info['iundefsym']
+                        features['lc_dysymtab_ntoc'] = info['ntoc']
+                        features['lc_dysymtab_ilocalsym'] = info['ilocalsym']
+                        features['lc_dysymtab_nundefsym'] = info['nundefsym']
+                        features['lc_dysymtab_nextrefsyms'] = info['nextrefsyms']
+                        features['lc_dysymtab_locreloff'] = info['locreloff']
+                        features['lc_dysymtab_nmodtab'] = info['nmodtab']
+                        features['lc_dysymtab_nlocalsym'] = info['nlocalsym']
+                        features['lc_dysymtab_tocoff'] = info['tocoff']
+                        features['lc_dysymtab_extrefsymoff'] = info['extrefsymoff']
+                        features['lc_dysymtab_nindirectsyms'] = info['nindirectsyms']
+                        features['lc_dysymtab_iextdefsym'] = info['iextdefsym']
+                        features['lc_dysymtab_nextrel'] = info['nextrel']
+                        features['lc_dysymtab_indirectsymoff'] = info['indirectsymoff']
+
+                except Exception as e:
+                    print 'Error getting command information: ' + str(e)
+            features['filename'] = os.path.split(filename)[1]
+
+            # Remove these 2 load command types
+            for lc in ['LC_MAIN', 'LC_UNIXTHREAD']:
+                if lc in features:
+                    features.pop(lc, None)
+
+            if include_cmd_size:
+                features.update(cmd_count_size)
+            else:
+                features.update(cmd_count)
+            results.append(features)
+    except Exception as e:
+        print "Error processing %s - %s" % (filename, str(e))
+        #print traceback.format_exc()
+
+    return results
+
+
+def gen_sigs(cluster_type, sigmeta, cluster, file_directory, signature_directory, fdf):
+    """
+    Generate the signatures for the samples in a cluster.
+    Takes:
+        cluster_type (name) - for signature name
+        sigmeta - meta to include in all signatures
+        cluster - cluster label (number)
+        file_directory - where the sample files live
+        signature_directory - where the output Yara files go
+        fdf - temp dataframe to calculate attributes from
+    """
+    # FAT binaries can yield duplicate filenames, so the number of unique filenames is
+    # generally less than the number of 'samples' printed.
+    print "Creating signature for cluster: %s from %s samples" % (cluster, len(fdf.filename.unique()))
+    meta = sigmeta.copy()
+    filename = fdf.filename.value_counts().index.tolist()[0]
+    count = 0
+    yara_rule_name = cluster_type + "_cluster_" + str(cluster)
+    for sample in fdf.filename.unique().tolist():
+        meta['sample_'+str(count)] = sample
+        count += 1
+
+    sig = data_hacking.YaraMachoGenerator(file_directory + "/" + filename, samplename=yara_rule_name, meta=meta)
+
+    lc_cmds = []
+    lc_symtab = []
+    lc_dysymtab = []
+    lc_dyld_info = []
+    lc_segment = []
+    lc_segment_64 = []
+
+    for col in fdf.columns:
+        if len(fdf[col].unique()) == 1:
+            if fdf[col].unique()[0] != 0:
+                # LC_* command counters must be positive to be useful; all-lowercase
+                # attribute columns are kept for any non-zero value
+                lower = [s for s in col if s.islower()]
+                if fdf[col].unique()[0] > 0 or (len(lower) == len(col)):
+                    if col.startswith('LC_'):
+                        lc_cmds.append(col)
+                    if col.startswith('lc_segment_64_'):
+                        lc_segment_64.append(col)
+                    elif col.startswith('lc_segment_'):
+                        lc_segment.append(col)
+                    if col.startswith('lc_symtab_'):
+                        lc_symtab.append(col)
+                    if col.startswith('lc_dysymtab_'):
+                        lc_dysymtab.append(col)
+                    if col.startswith('lc_dyld_info_'):
+                        lc_dyld_info.append(col)
+                    #print "%s - %s" % (c, fdf[c].unique()[0])
+
+    if len(lc_symtab) > 0:
+        if include_cmd_size:
+            lc_cmds = [x for x in lc_cmds if not x.startswith('LC_SYMTAB:')]
+        else:
+            lc_cmds = [x for x in lc_cmds if x != 'LC_SYMTAB']
+        lc_symtab = set([x[10:] for x in lc_symtab])
+        sig.add_symtab(lc_symtab)
+
+    if len(lc_dysymtab) > 0:
+        if include_cmd_size:
+            lc_cmds = [x for x in lc_cmds if not x.startswith('LC_DYSYMTAB:')]
+        else:
+            lc_cmds = [x for x in lc_cmds if x != 'LC_DYSYMTAB']
+        lc_dysymtab = set([x[12:] for x in lc_dysymtab])
+        sig.add_dysymtab(lc_dysymtab)
+
+    if len(lc_dyld_info) > 0:
+        if include_cmd_size:
+            lc_cmds = [x for x in lc_cmds if not x.startswith('LC_DYLD_INFO:')]
+            lc_cmds = [x for x in lc_cmds if not x.startswith('LC_DYLD_INFO_ONLY:')]
+        else:
+            lc_cmds = [x for x in lc_cmds if x != 'LC_DYLD_INFO']
+            lc_cmds = [x for x in lc_cmds if x != 'LC_DYLD_INFO_ONLY']
+        lc_dyld_info = set([x[13:] for x in lc_dyld_info])
+        sig.add_dyld_info(lc_dyld_info)
+
+    if len(lc_segment) > 0:
+        if include_cmd_size:
+            lc_cmds = [x for x in lc_cmds if not x.startswith('LC_SEGMENT:')]
+        else:
+            lc_cmds = [x for x in lc_cmds if x != 'LC_SEGMENT']
+        lc_segment = set([x[12:] for x in lc_segment])
+        sig.add_segment(lc_segment)
+
+    if len(lc_segment_64) > 0:
+        if include_cmd_size:
+            lc_cmds = [x for x in lc_cmds if not x.startswith('LC_SEGMENT_64:')]
+        else:
+            lc_cmds = [x for x in lc_cmds if x != 'LC_SEGMENT_64']
+        lc_segment_64 = set([x[14:] for x in lc_segment_64])
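+        # the Yara generator expects bare attribute names ('vmaddr', 'nsyms', ...), hence
+        # the prefix stripping above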
+        sig.add_segment(lc_segment_64)
+
+    if 'LC_VERSION_MIN_IPHONEOS' in lc_cmds:
+        if include_cmd_size:
+            lc_cmds = [x for x in lc_cmds if not x.startswith('LC_VERSION_MIN_IPHONEOS:')]
+        else:
+            lc_cmds = [x for x in lc_cmds if x != 'LC_VERSION_MIN_IPHONEOS']
+        # note: reuses the MACOSX version-min helper for the iPhoneOS load command
+        sig.add_version_min_macosx()
+
+    if 'LC_VERSION_MIN_MACOSX' in lc_cmds:
+        if include_cmd_size:
+            lc_cmds = [x for x in lc_cmds if not x.startswith('LC_VERSION_MIN_MACOSX:')]
+        else:
+            lc_cmds = [x for x in lc_cmds if x != 'LC_VERSION_MIN_MACOSX']
+        sig.add_version_min_macosx()
+
+    if include_cmd_size:
+        for x in lc_cmds:
+            sig.add_lc_count(x.split(':')[0], int(fdf[x].unique()[0]), size=int(x.split(':')[1]))
+    else:
+        for x in lc_cmds:
+            sig.add_lc(x)
+
+    sig.get_signature(filename=signature_directory + '/' + yara_rule_name + '.yara', writesig=True)
+
+
+def main():
+    """
+    Main function, responsible for building the features/dataframe, performing the clustering,
+    and generating the sigs.
+    """
+    global include_cmd_size
+    file_directory = "./"
+    signature_directory = "./"
+    eoptions = ['Add cmdsize and type as a feature']
+    cluster_types = ['DBSCAN', 'MeanShift', 'KMeans']
+    features_list = []
+
+    parser = ArgumentParser(description="Process Mach-O files, perform clustering on them, and spit out Yara signatures.")
+    parser.add_argument('cluster types', nargs='+',
+                        help='Use the following algorithm to cluster. Options are: [' + ', '.join(cluster_types) + ']')
+    parser.add_argument('-d', '--directory',
+                        help='The directory that contains the Mach-O files to analyze')
+    parser.add_argument('-Y', '--yara_sig_directory',
+                        help='Where to put the Yara signatures')
+    parser.add_argument('-e', '--email',
+                        help='Email address to include in the rule files')
+    parser.add_argument('-a', '--author',
+                        help='Author name to include in the rule files')
+    parser.add_argument('-k', '--k_clusters',
+                        help='Number of clusters for KMeans clustering instead of estimating one')
+    parser.add_argument('-b', '--bandwidth',
+                        help='Specify a bandwidth for MeanShift clustering instead of estimating one')
+    parser.add_argument('-m', '--min_samples',
+                        help='Specify the minimum number of samples per cluster via DBSCAN instead of the default (3)')
+    parser.add_argument('-P', '--pca',
+                        help='Perform PCA to the specified number of components on the features (can help create more generic signatures via larger clusters); a value of 0 makes this script estimate a reasonable "n"')
+    parser.add_argument('-S', '--scale', action='store_true',
+                        help='Scale the feature values')
+    parser.add_argument('-E', '--experimental', action='store_true',
+                        help='Set experimental options that affect signature generation: [' + ', '.join(eoptions) + ']')
+    #parser.add_argument('-', '--',
+    #                    help='')
+    args = parser.parse_args()
+
+    if args.directory:
+        file_directory = args.directory
+
+    if args.experimental:
+        include_cmd_size = True
+
+    for ctype in vars(args)['cluster types']:
+        if ctype not in cluster_types:
+            parser.print_help()
+            sys.exit(1)
+
+    meta = {}
+    if args.author:
+        meta['author'] = args.author
+    if args.email:
+        meta['email'] = args.email
+
+    if args.yara_sig_directory:
+        signature_directory = args.yara_sig_directory
+
+    file_list = glob.glob(file_directory + '/*')
+    for filename in file_list:
+        if os.path.isfile(filename):
+            features_list.extend(e_features(filename))
+    print "Ingest Information"
+    print "------------------"
+    print "Files:", len(file_list)
+    print "Number of feature vectors:", len(features_list)
+
+    if len(file_list) == 0 or len(features_list) == 0:
+        print "There was a problem reading in files and gathering features to cluster on."
+        sys.exit(2)
+
+    dataframe = pd.DataFrame.from_records(features_list)
+    for col in dataframe.columns:
+        if col.startswith('LC_'):
+            dataframe[col].fillna(0, inplace=True)
+
+    dataframe.fillna(-1, inplace=True)
+
+    cols = [x for x in dataframe.columns.tolist() if x != 'filename']
+    print "Features per feature vector:", len(cols)
+
+    X = dataframe.as_matrix(cols)
+
+    if args.scale:
+        print "Scaling features"
+        X = scale(X)
+
+    if args.pca:
+        print "Performing PCA"
+        if int(args.pca) == 0:
+            pca = PCA().fit(X)
+            n_comp = len([x for x in pca.explained_variance_ if x > 1e0])
+            print "Estimating number of components w/explained variance > 1: %s" % n_comp
+            X = PCA(n_components=n_comp).fit_transform(X)
+        else:
+            X = PCA(n_components=int(args.pca)).fit_transform(X)
+
+    if 'DBSCAN' in vars(args)['cluster types']:
+        cluster_df = pd.DataFrame()
+        from sklearn.cluster import DBSCAN
+
+        min_samples = 3
+        if args.min_samples:
+            min_samples = int(args.min_samples)
+        dbscan = DBSCAN(min_samples=min_samples)
+        dbscan.fit(X)
+        labels1 = dbscan.labels_
+        dataframe['cluster'] = labels1
+        labels1_u = np.unique(labels1)
+        # don't count the DBSCAN noise label (-1) as a cluster
+        nclusters = len([l for l in labels1_u if l != -1])
+        cluster_df = dataframe[['filename', 'cluster']]
+
+        print
+        print "DBSCAN Cluster Information"
+        print "--------------------------"
+        print "Number of clusters: %d" % nclusters
+        print "Labeled samples: %s" % cluster_df[cluster_df['cluster'] != -1].filename.value_counts().sum()
+        print "Unlabeled samples: %s" % cluster_df[cluster_df['cluster'] == -1].filename.value_counts().sum()
+        print
+        # Remove the unclustered samples
+        cluster_df = cluster_df[cluster_df['cluster'] != -1]
+        for cluster in cluster_df.cluster.unique().tolist():
+            fdf = pd.DataFrame()
+            cluster = int(cluster)
+            for fname in cluster_df[cluster_df['cluster'] == cluster].filename.tolist():
+                fdf = fdf.append(dataframe[dataframe['filename'] == fname], ignore_index=True)
+            gen_sigs('DBSCAN', meta, cluster, file_directory, signature_directory, fdf)
+
+    if 'MeanShift' in vars(args)['cluster types']:
+        cluster_df = pd.DataFrame()
+        from sklearn.cluster import MeanShift, estimate_bandwidth
+
+        ebw = 0
+        if args.bandwidth:
+            ebw = int(args.bandwidth)
+        else:
+            ebw = estimate_bandwidth(X)
+        ms1 = MeanShift(bandwidth=ebw)
+        ms1.fit(X)
+
+        labels1 = ms1.labels_
+        dataframe['cluster'] = labels1
+        labels1_u = np.unique(labels1)
+        nclusters = len(labels1_u)
+        cluster_df = dataframe[['filename', 'cluster']]
+
+        print
+        print "MeanShift Cluster Information"
+        print "-----------------------------"
+        print "Bandwidth: %s" % ebw
+        print "Number of clusters: %d" % nclusters
+        print
+
+        for cluster in cluster_df.cluster.unique().tolist():
+            fdf = pd.DataFrame()
+            cluster = int(cluster)
+            for fname in cluster_df[cluster_df['cluster'] == cluster].filename.tolist():
+                fdf = fdf.append(dataframe[dataframe['filename'] == fname], ignore_index=True)
+            gen_sigs('MeanShift', meta, cluster, file_directory, signature_directory, fdf)
+
+    if 'KMeans' in vars(args)['cluster types']:
+        cluster_df = pd.DataFrame()
+        from sklearn.cluster import KMeans
+
+        # rule of thumb: k = sqrt(n_samples / 2)
+        k_clusters = int(math.sqrt(int(len(features_list)/2)))
+        if args.k_clusters:
+            k_clusters = int(args.k_clusters)
+
+        from scipy.cluster.vq import kmeans
+        from scipy.spatial.distance import cdist
+
+        ##### elbow curve: run k-means over a range of candidate k values #####
+        #K, KM, centroids, D_k, cIdx, dist, avgWithinSS = kmeans.run_kmeans(X, 10)
+
+        K = range(2, len(X), 5)
+
+        # scipy.cluster.vq.kmeans
+        KM = [kmeans(X, k) for k in K]              # run kmeans for each candidate k
+        centroids = [cent for (cent, var) in KM]    # cluster centroids
+
+        D_k = [cdist(X, cent, 'euclidean') for cent in centroids]
+
+        cIdx = [np.argmin(D, axis=1) for D in D_k]
+        dist = [np.min(D, axis=1) for D in D_k]
+        avgWithinSS = [sum(d)/X.shape[0] for d in dist]
+        kIdx = 2    # index of the candidate k highlighted on the curve
+        # plot elbow curve
+        fig = plt.figure()
+        ax = fig.add_subplot(111)
+        ax.plot(K, avgWithinSS, 'b*-')
+        ax.plot(K[kIdx], avgWithinSS[kIdx], marker='o', markersize=12,
+                markeredgewidth=2, markeredgecolor='r', markerfacecolor='None')
+        plt.grid(True)
+        plt.xlabel('Number of clusters')
+        plt.ylabel('Average within-cluster sum of squares')
+        plt.title('Elbow for K-Means clustering')
+        fig.savefig('KMeans_elbow.png', bbox_inches='tight')
+
+        kmeans = KMeans(n_clusters=k_clusters)
+        kmeans.fit(X)
+        labels1 = kmeans.labels_
+        dataframe['cluster'] = labels1
+        labels1_u = np.unique(labels1)
+        nclusters = len(labels1_u)
+        cluster_df = dataframe[['filename', 'cluster']]
+
+        print
+        print "KMeans Cluster Information"
+        print "--------------------------"
+        print "Number of clusters: %d" % nclusters
+        print
+
+        for cluster in cluster_df.cluster.unique().tolist():
+            fdf = pd.DataFrame()
+            cluster = int(cluster)
+            for fname in cluster_df[cluster_df['cluster'] == cluster].filename.tolist():
+                fdf = fdf.append(dataframe[dataframe['filename'] == fname], ignore_index=True)
+            gen_sigs('KMeans', meta, cluster, file_directory, signature_directory, fdf)
+        save_graphic(dataframe, cols, signature_directory, "KMeans")
+
+if __name__ == "__main__":
+    main()
diff --git a/clusterfck/clusterfck_pefile b/clusterfck/clusterfck_pefile
new file mode 100755
index 0000000..5799027
--- /dev/null
+++ b/clusterfck/clusterfck_pefile
@@ -0,0 +1,431 @@
+#!/usr/bin/python
+"""
+This script parses a group of PE files, retrieves a subset of features/attributes from each file,
+performs various clustering algorithms on the features, and then, for the files in each cluster,
+generates Yara signatures that can help identify similar files.
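+
+Example invocation (hypothetical paths):
+
+    ./clusterfck_pefile -d ./pe_samples -Y ./sigs -a "Some Analyst" DBSCAN KMeans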
+""" + +import os +import sys +import math +import glob +import struct +import pefile +import numpy as np +import pandas as pd +from argparse import ArgumentParser +import data_hacking +import matplotlib.pyplot as plt +from mpl_toolkits.mplot3d import Axes3D +from sklearn.preprocessing import scale +from sklearn.decomposition import PCA + +def save_graphic(df, cols, signature_directory, cluster_type): + """ + Print a simple 3D graph of the cluster layout after using PCA to reduce the clusters to + three dimensions. + """ + print "Generating graphic of cluster representation" + X = df.as_matrix(cols) + X = scale(X) + X = PCA(n_components=3).fit_transform(X) + fig = plt.figure(figsize=plt.figaspect(1)) + ax = fig.add_subplot(1, 1, 1, projection='3d') + ax.scatter(X[:,0], X[:,1], X[:,2], c=df['cluster'], s=50) + ax.set_title(cluster_type + " Clusters") + fig.set_size_inches(24,18) + fig.set_dpi(400) + fig.savefig(cluster_type + '.png', bbox_inches='tight') + +def e_features(filename): + """ + Extract the features from the given filename. + + Uses pefile to do the file parsing, only a subset of features are chosen from the parsed file + """ + features = {} + try: + pe = pefile.PE(filename, fast_load=False) + + # File Header + features['filename'] = os.path.basename(filename) + features['machine'] = pe.FILE_HEADER.Machine + features['number of sections'] = pe.FILE_HEADER.NumberOfSections + features['compile date'] = pe.FILE_HEADER.TimeDateStamp + features['pointer to symbol table'] = pe.FILE_HEADER.PointerToSymbolTable + features['number of symbols'] = pe.FILE_HEADER.NumberOfSymbols + features['size of optional header'] = pe.FILE_HEADER.SizeOfOptionalHeader + features['characteristics'] = pe.FILE_HEADER.Characteristics + + # Optional Header + features['magic'] = pe.OPTIONAL_HEADER.Magic + features['major linker version'] = pe.OPTIONAL_HEADER.MajorLinkerVersion + features['minor linker version'] = pe.OPTIONAL_HEADER.MinorLinkerVersion + features['size of code'] = pe.OPTIONAL_HEADER.SizeOfCode + features['size init data'] = pe.OPTIONAL_HEADER.SizeOfInitializedData + features['size uninit data'] = pe.OPTIONAL_HEADER.SizeOfUninitializedData + features['entry point address'] = pe.OPTIONAL_HEADER.AddressOfEntryPoint + features['base of code'] = pe.OPTIONAL_HEADER.BaseOfCode + if hasattr(pe.OPTIONAL_HEADER, 'BaseOfData'): + features['base of data'] = pe.OPTIONAL_HEADER.BaseOfData + features['image base'] = float(pe.OPTIONAL_HEADER.ImageBase) + features['section alignment'] = pe.OPTIONAL_HEADER.SectionAlignment + features['file alignment'] = pe.OPTIONAL_HEADER.FileAlignment + features['major operating system version'] = pe.OPTIONAL_HEADER.MajorOperatingSystemVersion + features['minor operating system version'] = pe.OPTIONAL_HEADER.MinorOperatingSystemVersion + features['major image version'] = pe.OPTIONAL_HEADER.MajorImageVersion + features['minor image version'] = pe.OPTIONAL_HEADER.MinorImageVersion + features['major subsystem version'] = pe.OPTIONAL_HEADER.MajorSubsystemVersion + features['minor subsystem version'] = pe.OPTIONAL_HEADER.MinorSubsystemVersion + features['size of image'] = pe.OPTIONAL_HEADER.SizeOfImage + features['size of headers'] = pe.OPTIONAL_HEADER.SizeOfHeaders + features['checksum'] = pe.OPTIONAL_HEADER.CheckSum + features['subsystem'] = pe.OPTIONAL_HEADER.Subsystem + features['dll charactersitics'] = pe.OPTIONAL_HEADER.DllCharacteristics + features['size of stack reserve'] = float(pe.OPTIONAL_HEADER.SizeOfStackReserve) + features['size of stack commit'] = 
float(pe.OPTIONAL_HEADER.SizeOfStackCommit) + features['size of heap reserve'] = float(pe.OPTIONAL_HEADER.SizeOfHeapReserve) + features['size of heap commit'] = float(pe.OPTIONAL_HEADER.SizeOfHeapCommit) + features['loader flags'] = pe.OPTIONAL_HEADER.LoaderFlags + features['number of rva and sizes'] = pe.OPTIONAL_HEADER.NumberOfRvaAndSizes + + # Data directory + datadirs = {0: 'export table', 1: 'import table', + 2: 'resource table', 3: 'exception table', + 5: 'base relocation', 6: 'debug', + 9: 'tls table', 12: 'import address table'} + + data_directories = {} + for idx, datadir_name in datadirs.items(): + if len(pe.OPTIONAL_HEADER.DATA_DIRECTORY) <= idx: + continue + + directory = pe.OPTIONAL_HEADER.DATA_DIRECTORY[idx] + features['data dir ' + datadir_name + ' size'] = directory.Size + features['data dir ' + datadir_name + ' rva'] = directory.VirtualAddress + + + # Resource Entry (grab first two) + if hasattr(pe, 'DIRECTORY_ENTRY_RESOURCE'): + resources = [] + index = 0 + for resource_type in pe.DIRECTORY_ENTRY_RESOURCE.entries: + if resource_type.name is not None: + name = "%s" % resource_type.name + else: + name = "%s" % pefile.RESOURCE_TYPE.get(resource_type.struct.Id) + if name == None: + name = "%d" % resource_type.struct.Id + if hasattr(resource_type, 'directory'): + for resource_id in resource_type.directory.entries: + if hasattr(resource_id, 'directory'): + for resource_lang in resource_id.directory.entries: + resource = {} + if hasattr(resource_lang, 'data'): + try: + features['resource ' + str(index) + ' rva'] = resource_lang.data.struct.OffsetToData + features['resource ' + str(index) + ' size'] = resource_lang.data.struct.Size + features['resource ' + str(index) + ' lang'] = resource_lang.data.lang + index += 1 + if index == 2: + break + except pefile.PEFormatError as pfe: + pass + if index == 2: + break + if index == 2: + break + + except Exception as e: + print "Error processing %s - %s" % (filename, str(e)) + #print traceback.format_exc() + + return features + + +def gen_sigs(cluster_type, sigmeta, cluster, file_directory, signature_directory, fdf): + """ + Generate the signatures for the samples in a cluster. 
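+    Constant header fields are byte-packed (little-endian) for matching; fields with a
+    small number of distinct values (at most 9) are collected as lists of candidate values.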
+    Takes:
+        cluster_type (name) - for signature name
+        sigmeta - meta to include in all signatures
+        cluster - cluster label (number)
+        file_directory - where the sample files live
+        signature_directory - where the output Yara files go
+        fdf - temp dataframe to calculate attributes from
+    """
+    print "Creating signature for cluster: %s from %s samples" % (cluster, len(fdf.filename.unique()))
+    meta = sigmeta.copy()
+    filename = fdf.filename.value_counts().index.tolist()[0]
+    print "Using %s as the signature template" % filename
+    count = 0
+    yara_rule_name = cluster_type + "_cluster_" + str(cluster)
+    for sample in fdf.filename.unique().tolist():
+        meta['sample_'+str(count)] = sample
+        count += 1
+
+    file_header_columns = ["pointer to symbol table", "characteristics", "number of symbols",
+                           "size of optional header", "machine", "compile date", "number of sections"]
+
+    optional_header_columns = ["subsystem", "major image version",
+                               "major operating system version", "section alignment", "loader flags",
+                               "minor subsystem version", "major linker version",
+                               "size of code", "size of image", "number of rva and sizes", "dll characteristics",
+                               "file alignment", "minor linker version", "base of code",
+                               "size uninit data", "entry point address", "size init data", "major subsystem version",
+                               "magic", "checksum", "minor image version",
+                               "minor operating system version", "size of headers", "base of data",
+                               "data dir base relocation rva", "data dir base relocation size", "data dir debug rva",
+                               "data dir debug size", "data dir exception table rva", "data dir exception table size",
+                               "data dir export table rva", "data dir export table size",
+                               "data dir import address table rva", "data dir import address table size",
+                               "data dir import table rva", "data dir import table size",
+                               "data dir resource table rva", "data dir resource table size",
+                               "data dir tls table rva", "data dir tls table size"]
+
+    qword_columns = ['image base', 'size of stack reserve', 'size of stack commit', 'size of heap reserve', 'size of heap commit']
+
+    sig = data_hacking.YaraPEGenerator(file_directory + "/" + filename, samplename=yara_rule_name, meta=meta)
+
+    file_header = []
+    optional_header = {}
+
+    for col in fdf.columns:
+        if len(fdf[col].unique()) == 1:
+            if fdf[col].unique()[0] != -1:
+                if col in file_header_columns:
+                    file_header.append(col)
+                if col in optional_header_columns:
+                    # NOTE: the pack formats here are a reconstruction; QWORD-sized fields
+                    # are assumed to pack as little-endian 8-byte values, everything else
+                    # as little-endian 4-byte values
+                    if col in qword_columns:
+                        optional_header[col] = struct.pack("<Q", long(fdf[col].unique()[0]))
+                    else:
+                        optional_header[col] = struct.pack("<I", int(fdf[col].unique()[0]))
+        elif len(fdf[col].unique()) > 1:
+            if col not in optional_header_columns:
+                continue
+
+            if type(fdf[col].unique()[0]) == str or len(fdf[col].unique()) > 9:
+                continue
+
+            u = []
+            for value in fdf[col].unique():
+                if col in qword_columns:
+                    u.append(struct.pack("<Q", long(value)))
+                else:
+                    u.append(struct.pack("<I", int(value)))
+            optional_header[col] = u
+
+    if len(file_header) > 0:
+        sig.add_file_header(file_header)
+
+    if len(optional_header) > 0:
+        sig.add_optional_header_with_values(optional_header)
+    sig.get_signature(filename=signature_directory + '/' + yara_rule_name + '.yara', writesig=True)
+
+
+def main():
+    """
+    Main function, responsible for building the features/dataframe, performing the clustering,
+    and generating the sigs.
+    """
+    global include_cmd_size
+    file_directory = "./"
+    signature_directory = "./"
+    eoptions = ['Add cmdsize and type as a feature']
+    cluster_types = ['DBSCAN', 'MeanShift', 'KMeans']
+    features_list = []
+
+    parser = ArgumentParser(description="Process PE files, perform clustering on them, and spit out Yara signatures.")
+    parser.add_argument('cluster types', nargs='+',
+                        help='Use the following algorithm to cluster. Options are: [' + ', '.join(cluster_types) + ']')
+    parser.add_argument('-d', '--directory',
+                        help='directory that contains the PE files to analyze')
+    parser.add_argument('-Y', '--yara_sig_directory',
+                        help='where to put the Yara signatures')
+    parser.add_argument('-E', '--experimental', action='store_true',
+                        help='set experimental options that affect signature generation: [' + ', '.join(eoptions) + ']')
+    parser.add_argument('-e', '--email',
+                        help='email address to include in the rule files')
+    parser.add_argument('-a', '--author',
+                        help='author name to include in the rule files')
+    parser.add_argument('-k', '--k_clusters',
+                        help='number of clusters for KMeans clustering instead of estimating one')
+    parser.add_argument('-b', '--bandwidth',
+                        help='specify a bandwidth for MeanShift clustering instead of estimating one')
+    parser.add_argument('-m', '--min_samples',
+                        help='specify the minimum number of samples per cluster via DBSCAN instead of the default (3)')
+    parser.add_argument('-P', '--pca',
+                        help='Perform PCA to the specified number of components on the features (can help create more generic signatures via larger clusters); a value of 0 makes this script estimate a reasonable "n"')
+    parser.add_argument('-S', '--scale', action='store_true',
+                        help='Scale the feature values')
+    args = parser.parse_args()
+
+    if args.directory:
+        file_directory = args.directory
+
+    if args.experimental:
+        # note: the experimental size-based features are currently only used by the Mach-O variant
+        include_cmd_size = True
+
+    for ctype in vars(args)['cluster types']:
+        if ctype not in cluster_types:
+            parser.print_help()
+            sys.exit(1)
+
+    meta = {}
+    if args.author:
+        meta['author'] = args.author
+    if args.email:
+        meta['email'] = args.email
+
+    if args.yara_sig_directory:
+        signature_directory = args.yara_sig_directory
+
+    file_list = glob.glob(file_directory + '/*')
+    for filename in file_list:
+        if os.path.isfile(filename):
+            features_list.append(e_features(filename))
+    print "Ingest Information"
+    print "------------------"
+    print "Files:", len(file_list)
+    print "Number of feature vectors:", len(features_list)
+
+    if len(file_list) == 0 or len(features_list) == 0:
+        print "There was a problem reading in files and gathering features to cluster on."
+        sys.exit(2)
+
+    dataframe = pd.DataFrame.from_records(features_list)
+    for col in dataframe.columns:
+        if col.startswith('resource'):
+            dataframe[col].fillna(-1, inplace=True)
+
+    dataframe.fillna(-1, inplace=True)
+
+    cols = [x for x in dataframe.columns.tolist() if x != 'filename']
+    print "Features per feature vector:", len(cols)
+
+    X = dataframe.as_matrix(cols)
+
+    if args.scale:
+        print "Scaling features"
+        X = scale(X)
+
+    if args.pca:
+        print "Performing PCA"
+        if int(args.pca) == 0:
+            pca = PCA().fit(X)
+            n_comp = len([x for x in pca.explained_variance_ if x > 1e0])
+            print "Estimating number of components w/explained variance > 1: %s" % n_comp
+            X = PCA(n_components=n_comp).fit_transform(X)
+        else:
+            X = PCA(n_components=int(args.pca)).fit_transform(X)
+
+    if 'DBSCAN' in vars(args)['cluster types']:
+        cluster_df = pd.DataFrame()
+        from sklearn.cluster import DBSCAN
+
+        min_samples = 3
+        if args.min_samples:
+            min_samples = int(args.min_samples)
+        dbscan = DBSCAN(min_samples=min_samples)
+        dbscan.fit(X)
+        labels1 = dbscan.labels_
+        dataframe['cluster'] = labels1
+        labels1_u = np.unique(labels1)
+        # don't count the DBSCAN noise label (-1) as a cluster
+        nclusters = len([l for l in labels1_u if l != -1])
+        cluster_df = dataframe[['filename', 'cluster']]
+
+        print
+        print "DBSCAN Cluster Information"
+        print "--------------------------"
+        print "Number of clusters: %d" % nclusters
+        print "Labeled samples: %s" % cluster_df[cluster_df['cluster'] != -1].filename.value_counts().sum()
+        print "Unlabeled samples: %s" % cluster_df[cluster_df['cluster'] == -1].filename.value_counts().sum()
+        print
+        # Remove the unclustered samples
+        cluster_df = cluster_df[cluster_df['cluster'] != -1]
+        for cluster in cluster_df.cluster.unique().tolist():
+            fdf = pd.DataFrame()
+            cluster = int(cluster)
+            for fname in cluster_df[cluster_df['cluster'] == cluster].filename.tolist():
+                fdf = fdf.append(dataframe[dataframe['filename'] == fname], ignore_index=True)
+            gen_sigs('DBSCAN', meta, cluster, file_directory, signature_directory, fdf)
+
+    if 'MeanShift' in vars(args)['cluster types']:
+        cluster_df = pd.DataFrame()
+        from sklearn.cluster import MeanShift, estimate_bandwidth
+
+        ebw = 0
+        if args.bandwidth:
+            ebw = int(args.bandwidth)
+        else:
+            ebw = estimate_bandwidth(X)
+        ms1 = MeanShift(bandwidth=ebw)
+        ms1.fit(X)
+
+        labels1 = ms1.labels_
+        dataframe['cluster'] = labels1
+        labels1_u = np.unique(labels1)
+        nclusters = len(labels1_u)
+        cluster_df = dataframe[['filename', 'cluster']]
+
+        print
+        print "MeanShift Cluster Information"
+        print "-----------------------------"
+        print "Bandwidth: %s" % ebw
+        print "Number of clusters: %d" % nclusters
+        print
+
+        for cluster in cluster_df.cluster.unique().tolist():
+            fdf = pd.DataFrame()
+            cluster = int(cluster)
+            for fname in cluster_df[cluster_df['cluster'] == cluster].filename.tolist():
+                fdf = fdf.append(dataframe[dataframe['filename'] == fname], ignore_index=True)
+            gen_sigs('MeanShift', meta, cluster, file_directory, signature_directory, fdf)
+
+    if 'KMeans' in vars(args)['cluster types']:
+        cluster_df = pd.DataFrame()
+        from sklearn.cluster import KMeans
+
+        # rule of thumb: k = sqrt(n_samples / 2)
+        k_clusters = int(math.sqrt(int(len(features_list)/2)))
+        if args.k_clusters:
+            k_clusters = int(args.k_clusters)
+
+        kmeans = KMeans(n_clusters=k_clusters)
+        kmeans.fit(X)
+        labels1 = kmeans.labels_
+        dataframe['cluster'] = labels1
+        labels1_u = np.unique(labels1)
+        nclusters = len(labels1_u)
+        cluster_df = dataframe[['filename', 'cluster']]
+
+        print
+        print "KMeans Cluster Information"
+        print "--------------------------"
+        print "Number of clusters: %d" % nclusters
+        print
+
+        for cluster in cluster_df.cluster.unique().tolist():
+            fdf = pd.DataFrame()
+            cluster = int(cluster)
+            for fname in cluster_df[cluster_df['cluster'] == cluster].filename.tolist():
+                fdf = fdf.append(dataframe[dataframe['filename'] == fname], ignore_index=True)
+            gen_sigs('KMeans', meta, cluster, file_directory, signature_directory, fdf)
+
+if __name__ == "__main__":
+    main()