TSP solver bug fix (#1480)
Addressed comments from: #1450
In addition:

1. Made the raft handle const.
2. Split the algorithm into multiple kernel calls and updated the timers.
3. Removed the global device symbols; results are now retrieved through a struct (see the sketch after this list).
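
A minimal sketch of the struct-based pattern referred to in item 3 is below. The struct name TSPResults and the exact member types are assumptions (the real definition lives in tsp_solver.hpp, which is not expanded in this diff); the member names match the results_ fields used in tsp.cu.

// Illustrative sketch only, not the committed definition.
// Kernels receive the struct by value and publish their result through
// these device pointers, so the host reads results back with ordinary
// copies instead of cudaMemcpyFromSymbol on global __device__ symbols.
struct TSPResults {
  float **best_x_pos;  // device slot: address of the winning tour's x coordinates
  float **best_y_pos;  // device slot: address of the winning tour's y coordinates
  int **best_route;    // device slot: address of the winning tour's node order
  int *best_cost;      // device scalar: cost of the winning tour
};

// Host-side retrieval, mirroring TSP::compute(): read the published
// address, then copy the payload.
float *d_best_x = nullptr;
raft::update_host(&d_best_x, results_.best_x_pos, 1, stream_);
CUDA_TRY(cudaStreamSynchronize(stream_));
raft::copy(h_x_pos.data(), d_best_x, nodes_ + 1, stream_);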

Authors:
  - Hugo Linsenmaier (@hlinsen)

Approvers:
  - Brad Rees (@BradReesWork)
  - Andrei Schaffer (@aschaffer)
  - Alex Fender (@afender)

URL: #1480
hlinsen authored Mar 26, 2021
1 parent 76fad0e commit b85bd47
Showing 6 changed files with 216 additions and 219 deletions.
2 changes: 1 addition & 1 deletion cpp/include/algorithms.hpp
@@ -218,7 +218,7 @@ void force_atlas2(GraphCOOView<vertex_t, edge_t, weight_t> &graph,
* @param[out] route Device array containing the returned route.
*
*/
float traveling_salesperson(raft::handle_t &handle,
float traveling_salesperson(raft::handle_t const &handle,
int const *vtx_ptr,
float const *x_pos,
float const *y_pos,
202 changes: 109 additions & 93 deletions cpp/src/traversal/tsp.cu
@@ -17,13 +17,15 @@
#include <numeric>
#include <raft/spatial/knn/knn.hpp>

#include <utilities/high_res_timer.hpp>

#include "tsp.hpp"
#include "tsp_solver.hpp"

namespace cugraph {
namespace detail {

TSP::TSP(raft::handle_t &handle,
TSP::TSP(raft::handle_t const &handle,
int const *vtx_ptr,
float const *x_pos,
float const *y_pos,
@@ -50,139 +52,153 @@ TSP::TSP(raft::handle_t &handle,
max_threads_(handle_.get_device_properties().maxThreadsPerBlock),
warp_size_(handle_.get_device_properties().warpSize),
sm_count_(handle_.get_device_properties().multiProcessorCount),
restart_batch_(4096)
restart_batch_(8192),
neighbors_vec_((k_ + 1) * nodes_, stream_),
work_vec_(restart_batch_ * ((4 * nodes_ + 3 + warp_size_ - 1) / warp_size_ * warp_size_),
stream_),
best_x_pos_vec_(1, stream_),
best_y_pos_vec_(1, stream_),
best_route_vec_(1, stream_)
{
allocate();
setup();
}

void TSP::allocate()
void TSP::setup()
{
// Scalars
mylock_ = mylock_scalar_.data();
best_tour_ = best_tour_scalar_.data();
climbs_ = climbs_scalar_.data();
mylock_ = mylock_scalar_.data();

// Vectors
neighbors_vec_.resize((k_ + 1) * nodes_);
neighbors_ = neighbors_vec_.data();
// pre-allocate workspace for climbs, each block needs a separate permutation space and search
// buffer. We allocate a work buffer that will store the computed distances, px, py and the route.
// We align it on the warp size.
work_vec_.resize(sizeof(float) * restart_batch_ *
((4 * nodes_ + 3 + warp_size_ - 1) / warp_size_ * warp_size_));
work_ = work_vec_.data();

results_.best_x_pos = best_x_pos_vec_.data();
results_.best_y_pos = best_y_pos_vec_.data();
results_.best_route = best_route_vec_.data();
results_.best_cost = best_cost_scalar_.data();
}

// Pointers
neighbors_ = neighbors_vec_.data().get();
work_ = work_vec_.data().get();
void TSP::reset_batch()
{
mylock_scalar_.set_value_zero(stream_);
auto const max{std::numeric_limits<int>::max()};
best_cost_scalar_.set_value(max, stream_);
}

void TSP::get_initial_solution(int const batch)
{
if (!beam_search_) {
random_init<<<restart_batch_, best_thread_num_>>>(
work_, x_pos_, y_pos_, vtx_ptr_, nstart_, nodes_, batch, restart_batch_);
CHECK_CUDA(stream_);
} else {
knn_init<<<restart_batch_, best_thread_num_>>>(
work_, x_pos_, y_pos_, vtx_ptr_, neighbors_, nstart_, nodes_, k_, batch, restart_batch_);
CHECK_CUDA(stream_);
}
}

float TSP::compute()
{
float valid_coo_dist = 0.f;
float final_cost = 0.f;
int num_restart_batches = (restarts_ + restart_batch_ - 1) / restart_batch_;
int restart_resid = restarts_ - (num_restart_batches - 1) * restart_batch_;
int global_best = INT_MAX;
float *soln = nullptr;
int *route_sol = nullptr;
int global_best = std::numeric_limits<int>::max();
int best = 0;

std::vector<float> h_x_pos;
std::vector<float> h_y_pos;
std::vector<int> h_route;
h_x_pos.reserve(nodes_ + 1);
h_y_pos.reserve(nodes_ + 1);

// Stats
int n_timers = 3;
long total_climbs = 0;
std::vector<float> h_times;
struct timeval starttime, endtime;

// KNN call
knn();
h_route.reserve(nodes_);
std::vector<float *> addr_best_x_pos(1);
std::vector<float *> addr_best_y_pos(1);
std::vector<int *> addr_best_route(1);
HighResTimer hr_timer;
auto create_timer = [&hr_timer, this](char const *name) {
return VerboseTimer(name, hr_timer, verbose_);
};

if (verbose_) {
std::cout << "Doing " << num_restart_batches - 1 << " batches of size " << restart_batch_
std::cout << "Doing " << num_restart_batches << " batches of size " << restart_batch_
<< ", with " << restart_resid << " tail\n";
std::cout << "configuration: " << nodes_ << " nodes, " << restarts_ << " restart\n";
std::cout << "optimizing graph with kswap = " << kswaps << "\n";
}

// Tell the cache how we want it to behave
cudaFuncSetCacheConfig(search_solution, cudaFuncCachePreferEqual);
best_thread_num_ = best_thread_count(nodes_, max_threads_, sm_count_, warp_size_);

int threads = best_thread_count(nodes_, max_threads_, sm_count_, warp_size_);
if (verbose_) std::cout << "Calculated best thread number = " << threads << "\n";
if (verbose_) std::cout << "Calculated best thread number = " << best_thread_num_ << "\n";

rmm::device_vector<float> times(n_timers * threads + n_timers);
h_times.reserve(n_timers * threads + n_timers);
if (beam_search_) {
auto timer = create_timer("knn");
knn();
}

gettimeofday(&starttime, NULL);
for (int b = 0; b < num_restart_batches; ++b) {
reset<<<1, 1, 0, stream_>>>(mylock_, best_tour_, climbs_);
CHECK_CUDA(stream_);
for (auto batch = 0; batch < num_restart_batches; ++batch) {
reset_batch();
if (batch == num_restart_batches - 1) restart_batch_ = restart_resid;

if (b == num_restart_batches - 1) restart_batch_ = restart_resid;

search_solution<<<restart_batch_, threads, sizeof(int) * threads, stream_>>>(mylock_,
best_tour_,
vtx_ptr_,
beam_search_,
k_,
nodes_,
neighbors_,
x_pos_,
y_pos_,
work_,
nstart_,
times.data().get(),
climbs_,
threads);
{
auto timer = create_timer("initial_sol");
get_initial_solution(batch);
}

CHECK_CUDA(stream_);
cudaDeviceSynchronize();
{
auto timer = create_timer("search_sol");
search_solution<<<restart_batch_,
best_thread_num_,
sizeof(int) * best_thread_num_,
stream_>>>(
results_, mylock_, vtx_ptr_, beam_search_, k_, nodes_, x_pos_, y_pos_, work_, nstart_);
CHECK_CUDA(stream_);
}

{
auto timer = create_timer("optimal_tour");
get_optimal_tour<<<restart_batch_,
best_thread_num_,
sizeof(int) * best_thread_num_,
stream_>>>(results_, mylock_, work_, nodes_);
CHECK_CUDA(stream_);
}

CUDA_TRY(cudaMemcpy(&best, best_tour_, sizeof(int), cudaMemcpyDeviceToHost));
cudaDeviceSynchronize();
best = best_cost_scalar_.value(stream_);

if (verbose_) std::cout << "Best reported by kernel = " << best << "\n";

if (best < global_best) {
global_best = best;
CUDA_TRY(cudaMemcpyFromSymbol(&soln, best_soln, sizeof(void *)));
cudaDeviceSynchronize();
CUDA_TRY(cudaMemcpyFromSymbol(&route_sol, best_route, sizeof(void *)));
cudaDeviceSynchronize();

raft::update_host(addr_best_x_pos.data(), results_.best_x_pos, 1, stream_);
raft::update_host(addr_best_y_pos.data(), results_.best_y_pos, 1, stream_);
raft::update_host(addr_best_route.data(), results_.best_route, 1, stream_);
CUDA_TRY(cudaStreamSynchronize(stream_));

raft::copy(h_x_pos.data(), addr_best_x_pos[0], nodes_ + 1, stream_);
raft::copy(h_y_pos.data(), addr_best_y_pos[0], nodes_ + 1, stream_);
raft::copy(h_route.data(), addr_best_route[0], nodes_, stream_);
raft::copy(route_, addr_best_route[0], nodes_, stream_);
CHECK_CUDA(stream_);
}
total_climbs += climbs_scalar_.value(stream_);
}
gettimeofday(&endtime, NULL);
double runtime =
endtime.tv_sec + endtime.tv_usec / 1e6 - starttime.tv_sec - starttime.tv_usec / 1e6;
long long moves = 1LL * total_climbs * (nodes_ - 2) * (nodes_ - 1) / 2;

raft::copy(route_, route_sol, nodes_, stream_);

CUDA_TRY(cudaMemcpy(h_x_pos.data(), soln, sizeof(float) * (nodes_ + 1), cudaMemcpyDeviceToHost));
cudaDeviceSynchronize();
CUDA_TRY(cudaMemcpy(
h_y_pos.data(), soln + nodes_ + 1, sizeof(float) * (nodes_ + 1), cudaMemcpyDeviceToHost));
cudaDeviceSynchronize();

for (int i = 0; i < nodes_; ++i) {
if (verbose_) { std::cout << h_x_pos[i] << " " << h_y_pos[i] << "\n"; }
valid_coo_dist += euclidean_dist(h_x_pos.data(), h_y_pos.data(), i, i + 1);
}

CUDA_TRY(cudaMemcpy(h_times.data(),
times.data().get(),
sizeof(float) * n_timers * threads + n_timers,
cudaMemcpyDeviceToHost));
cudaDeviceSynchronize();
for (auto i = 0; i < nodes_; ++i) {
if (verbose_) { std::cout << h_route[i] << ": " << h_x_pos[i] << " " << h_y_pos[i] << "\n"; }
final_cost += euclidean_dist(h_x_pos.data(), h_y_pos.data(), i, i + 1);
}

if (verbose_) {
std::cout << "Search runtime = " << runtime << ", " << moves * 1e-9 / runtime << " Gmoves/s\n";
hr_timer.display(std::cout);
std::cout << "Optimized tour length = " << global_best << "\n";
print_times(h_times, n_timers, handle_.get_device(), threads);
}

return valid_coo_dist;
return final_cost;
}

void TSP::knn()
@@ -192,17 +208,17 @@ void TSP::knn()
int dim = 2;
bool row_major_order = false;

rmm::device_vector<float> input(nodes_ * dim);
float *input_ptr = input.data().get();
rmm::device_uvector<float> input(nodes_ * dim, stream_);
float *input_ptr = input.data();
raft::copy(input_ptr, x_pos_, nodes_, stream_);
raft::copy(input_ptr + nodes_, y_pos_, nodes_, stream_);

rmm::device_vector<float> search_data(nodes_ * dim);
float *search_data_ptr = search_data.data().get();
rmm::device_uvector<float> search_data(nodes_ * dim, stream_);
float *search_data_ptr = search_data.data();
raft::copy(search_data_ptr, input_ptr, nodes_ * dim, stream_);

rmm::device_vector<float> distances(nodes_ * (k_ + 1));
float *distances_ptr = distances.data().get();
rmm::device_uvector<float> distances(nodes_ * (k_ + 1), stream_);
float *distances_ptr = distances.data();

std::vector<float *> input_vec;
std::vector<int> sizes_vec;
Expand All @@ -226,7 +242,7 @@ void TSP::knn()
}
} // namespace detail

float traveling_salesperson(raft::handle_t &handle,
float traveling_salesperson(raft::handle_t const &handle,
int const *vtx_ptr,
float const *x_pos,
float const *y_pos,

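A condensed, illustrative view of the new per-batch flow in TSP::compute() after the split into separate kernels. VerboseTimer is assumed to be a small RAII helper around HighResTimer that records a labeled range only when verbose_ is set; its definition is not part of the expanded diff, and kernel argument lists are elided.

// Sketch of the batched loop; not a verbatim copy of the diff above.
for (auto batch = 0; batch < num_restart_batches; ++batch) {
  reset_batch();  // zero the lock, reset best_cost to INT_MAX
  if (batch == num_restart_batches - 1) restart_batch_ = restart_resid;

  {
    auto t = create_timer("initial_sol");  // random_init or knn_init kernel
    get_initial_solution(batch);
  }
  {
    auto t = create_timer("search_sol");  // hill-climbing search over the batch
    search_solution<<<restart_batch_, best_thread_num_,
                      sizeof(int) * best_thread_num_, stream_>>>(/* ... */);
  }
  {
    auto t = create_timer("optimal_tour");  // pick the best tour of the batch
    get_optimal_tour<<<restart_batch_, best_thread_num_,
                       sizeof(int) * best_thread_num_, stream_>>>(/* ... */);
  }

  best = best_cost_scalar_.value(stream_);  // best cost found by this batch
  if (best < global_best) {
    global_best = best;
    // read the published pointers from results_ and copy the tour to the host
  }
}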