Skip to content

Commit

Permalink
Merge pull request #204 from bacpop/v250-candidate
Browse files Browse the repository at this point in the history
Release for v2.5.0
  • Loading branch information
johnlees authored Aug 25, 2022
2 parents f2f0fcf + ce01979 commit 2e593a4
Show file tree
Hide file tree
Showing 94 changed files with 6,834 additions and 2,512 deletions.
6 changes: 6 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
build
dist
*.egg-info
.git
test
docs
7 changes: 5 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.18)
cmake_minimum_required(VERSION 3.16)
project(poppunk_refine)
set(CMAKE_CXX_STANDARD 14)

Expand All @@ -25,9 +25,12 @@ if(DEFINED ENV{CONDA_PREFIX})
endif()

# Add libraries
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/vendor)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src)

execute_process(COMMAND pybind11-config --cmakedir OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE pybind11_DIR)
find_package(pybind11 REQUIRED)
find_package (Eigen3 3.3 REQUIRED NO_MODULE)
find_package(Eigen3 3.3 REQUIRED NO_MODULE)
#find_package(OpenMP) # This links system openmp if present - conda sorts out rpath but take care

# Define python library target
Expand Down
5 changes: 4 additions & 1 deletion MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,2 +1,5 @@
recursive-include scripts *.py
recursive-include PopPUNK/data *.gz
recursive-include PopPUNK/data *.gz
recursive-include vendor *
recursive-include src *
include CMakeLists.txt
6 changes: 3 additions & 3 deletions PopPUNK/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@

'''PopPUNK (POPulation Partitioning Using Nucleotide Kmers)'''

__version__ = '2.4.6'
__version__ = '2.5.0'

# Minimum sketchlib version
SKETCHLIB_MAJOR = 1
SKETCHLIB_MINOR = 7
SKETCHLIB_MAJOR = 2
SKETCHLIB_MINOR = 0
SKETCHLIB_PATCH = 0
216 changes: 137 additions & 79 deletions PopPUNK/__main__.py

Large diffs are not rendered by default.

679 changes: 395 additions & 284 deletions PopPUNK/assign.py

Large diffs are not rendered by default.

Binary file added PopPUNK/data/microreact_example.pkl
Binary file not shown.
2 changes: 1 addition & 1 deletion PopPUNK/info.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
def get_options():

parser = argparse.ArgumentParser(description='Get information about a PopPUNK database',
prog='poppunk_db_info')
prog='poppunk_info')

# input options
parser.add_argument('--db',
Expand Down
177 changes: 177 additions & 0 deletions PopPUNK/mandrake.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
#!/usr/bin/env python
# vim: set fileencoding=<utf-8> :
# Copyright 2018-2022 John Lees and Nick Croucher

import os
import sys
import numpy as np
from functools import partial
import random

import pp_sketchlib
from SCE import wtsne
try:
from SCE import wtsne_gpu_fp64
gpu_fn_available = True
except ImportError:
gpu_fn_available = False

from .utils import readPickle

def generate_embedding(seqLabels, accMat, perplexity, outPrefix, overwrite, kNN = 50,
maxIter = 1000000, n_threads = 1, use_gpu = False, device_id = 0):
"""Generate t-SNE projection using accessory distances
Writes a plot of t-SNE clustering of accessory distances (.dot)
Args:
seqLabels (list)
Processed names of sequences being analysed.
accMat (numpy.array)
n x n array of accessory distances for n samples.
perplexity (int)
Perplexity parameter passed to t-SNE
outPrefix (str)
Prefix for all generated output files, which will be placed in
`outPrefix` subdirectory
overwrite (bool)
Overwrite existing output if present
(default = False)
kNN (int)
Number of neigbours to use with SCE (cannot be > n_samples)
(default = 50)
maxIter (int)
Number of iterations to run
(default = 1000000)
n_threads (int)
Number of CPU threads to use
(default = 1)
use_gpu (bool)
Whether to use GPU libraries
device_id (int)
Device ID of GPU to be used
(default = 0)
Returns:
mandrake_filename (str)
Filename with .dot of embedding
"""
# generate accessory genome distance representation
mandrake_filename = outPrefix + "/" + os.path.basename(outPrefix) + "_perplexity" + str(perplexity) + "_accessory_mandrake.dot"
if os.path.isfile(mandrake_filename) and not overwrite:
sys.stderr.write("Mandrake analysis already exists; add --overwrite to replace\n")
else:
sys.stderr.write("Running mandrake\n")
kNN = max(kNN, len(seqLabels) - 1)
I, J, dists = pp_sketchlib.sparsifyDists(distMat=accMat, distCutoff=0, kNN=kNN)

# Set up function call with either CPU or GPU
weights = np.ones((len(seqLabels)))
random.Random()
seed = random.randint(0, 2**32)
if use_gpu and gpu_fn_available:
sys.stderr.write("Running on GPU\n")
n_workers = 65536
maxIter = round(maxIter / n_workers)
wtsne_call = partial(wtsne_gpu_fp64,
perplexity=perplexity,
maxIter=maxIter,
blockSize=128,
n_workers=n_workers,
nRepuSamp=5,
eta0=1,
bInit=0,
animated=False,
cpu_threads=n_threads,
device_id=device_id,
seed=seed)
else:
sys.stderr.write("Running on CPU\n")
maxIter = round(maxIter / n_threads)
wtsne_call = partial(wtsne,
perplexity=perplexity,
maxIter=maxIter,
nRepuSamp=5,
eta0=1,
bInit=0,
animated=False,
n_workers=n_threads,
n_threads=n_threads,
seed=seed)

# Run embedding with C++ extension
embedding_result = wtsne_call(I, J, dists, weights)
embedding = np.array(embedding_result.get_embedding()).reshape(-1, 2)

# print dot file
with open(mandrake_filename, 'w') as nFile:
nFile.write("graph G { ")
for s, seqLabel in enumerate(seqLabels):
nFile.write('"' + seqLabel + '"' +
'[x='+str(5*float(embedding[s][0]))+',y='+str(5*float(embedding[s][1]))+']; ')
nFile.write("}\n")

return mandrake_filename

# command line parsing
def get_options():

import argparse

parser = argparse.ArgumentParser(description='Run mandrake embedding of accessory distances',
prog='poppunk_mandrake')

# input options
parser.add_argument('--distances', required=True, help='Prefix of input pickle and numpy file of pre-calculated '
'distances')
parser.add_argument('--output', required=True, help='Name of output file')
parser.add_argument('--perplexity', help='Perplexity used to generate projection [default = 30]', type=int, default=30)
parser.add_argument('--knn', help='Number of neighbours used to generate t-SNE projection [default = 50]', type=int, default=50)
parser.add_argument('--iter', help='Number of iterations [default = 1000000]', type=int, default=1000000)
parser.add_argument('--cpus', help="Number of CPU threads", type=int, default=1)
parser.add_argument('--use-gpu', help='Whether to use GPU libraries for t-SNE calculation', default = False, action='store_true')
parser.add_argument('--device-id', help="Device ID of GPU to use", type=int, default=0)

return parser.parse_args()


# main code
def main():

# Check input ok
args = get_options()

if not os.path.isdir(args.output):
try:
os.makedirs(args.output)
except OSError:
sys.stderr.write("Cannot create output directory\n")
sys.exit(1)

# load saved distance matrix
refList, queryList, self, distMat = readPickle(args.distances,
enforce_self=True)

# process list of file names
seqLabels = [r.split('/')[-1].split('.')[0] for r in refList]

# generate accMat
accMat = pp_sketchlib.longToSquare(distVec=distMat[:, [1]],
num_threads=args.cpus)

# generate accessory genome distance representation
generate_embedding(seqLabels,
accMat,
args.perplexity,
args.output,
overwrite=True,
kNN=args.knn,
maxIter=args.iter,
n_threads=args.cpus,
use_gpu = args.use_gpu,
device_id = args.device_id)

if __name__ == "__main__":
main()

sys.exit(0)
27 changes: 16 additions & 11 deletions PopPUNK/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,11 @@
# additional
import numpy as np
import random
import operator
import pickle
import shutil
import re
from sklearn import utils
import scipy.optimize
from scipy.spatial.distance import euclidean
from scipy import stats
import scipy.sparse
import hdbscan
Expand Down Expand Up @@ -770,7 +768,10 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi
# Get starting point
model.no_scale()
if startFile:
self.mean0, self.mean1 = readManualStart(startFile)
self.mean0, self.mean1, scaled = readManualStart(startFile)
if not scaled:
self.mean0 /= self.scale
self.mean1 /= self.scale
elif model.type == 'dbscan':
sys.stderr.write("Initial model-based network construction based on DBSCAN fit\n")
self.mean0 = model.cluster_means[model.within_label, :]
Expand All @@ -783,8 +784,9 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi
raise RuntimeError("Unrecognised model type")

# Main refinement in 2D
scaled_X = X / self.scale
self.optimal_x, self.optimal_y, optimal_s = \
refineFit(X/self.scale,
refineFit(scaled_X,
sample_names,
self.mean0,
self.mean1,
Expand All @@ -802,10 +804,12 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi

# Output clusters at more positions if requested
if multi_boundary > 1:
multi_refine(X/self.scale,
sys.stderr.write("Creating multiple boundary fits\n")
multi_refine(scaled_X,
sample_names,
self.mean0,
self.mean1,
self.scale,
optimal_s,
multi_boundary,
self.outPrefix,
Expand All @@ -822,7 +826,7 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi
sys.stderr.write("Refining " + dist_type + " distances separately\n")
# optimise core distance boundary
core_boundary, accessory_boundary, s = \
refineFit(X/self.scale,
refineFit(scaled_X,
sample_names,
self.mean0,
self.mean1,
Expand All @@ -844,7 +848,6 @@ def fit(self, X, sample_names, model, max_move, min_move, startFile = None, indi
print(e)
sys.stderr.write("Could not separately refine core and accessory boundaries. "
"Using joint 2D refinement only.\n")
self.indiv_fitted = True
y = self.assign(X)
return y

Expand Down Expand Up @@ -1081,9 +1084,10 @@ def fit(self, X, accessory):
for rank in self.ranks:
row, col, data = \
pp_sketchlib.sparsifyDists(
pp_sketchlib.longToSquare(X[:, [self.dist_col]], self.threads),
0,
rank
distMat=pp_sketchlib.longToSquare(distVec=X[:, [self.dist_col]],
num_threads=self.threads),
distCutoff=0,
kNN=rank
)
self.__save_sparse__(data, row, col, rank, sample_size, X.dtype)

Expand Down Expand Up @@ -1199,7 +1203,8 @@ def extend(self, qqDists, qrDists):
qrDists = cp.array(qrDists)

# Reshape qq and qr dist matrices
qqSquare = pp_sketchlib.longToSquare(qqDists[:, [self.dist_col]], self.threads)
qqSquare = pp_sketchlib.longToSquare(distVec=qqDists[:, [self.dist_col]],
num_threads=self.threads)
qqSquare[qqSquare < epsilon] = epsilon

n_ref = self.nn_dists[self.ranks[0]].shape[0]
Expand Down
Loading

0 comments on commit 2e593a4

Please sign in to comment.