4 #include "detail/slab_node.h"
5 #include "detail/traverse.h"
7 #include <thrust/device_vector.h>
9 #include <cuda/std/utility>
// Kernel declaration: bulk put of `size` (key, value) pairs.
// Thread i inserts operations[i] into `map` and writes the previous value
// associated with that key to output[i]. Launched with `block_size` threads
// per block and one thread per operation.
template<
    int block_size,
    typename map_t,
    typename K,
    typename V>
__global__
void put_(map_t map, cuda::std::pair<K, V>* operations, V* output,
          size_t size);
// Kernel declaration: bulk put with structure-of-arrays inputs.
// Thread i inserts (operations_keys[i], operations_values[i]) into `map` and
// writes the previous value for that key to output[i]. SoA layout gives
// coalesced loads compared to the pair-based overload.
template<
    int block_size,
    typename map_t,
    typename K,
    typename V>
__global__
void put_(map_t map, K* operations_keys, V* operations_values, V* output,
          size_t size);
// Kernel declaration: bulk lookup.
// Thread i looks up operations[i] in `map` and writes {found, value} to
// output[i]; when the key is absent, `found` is false.
template<
    int block_size,
    typename map_t,
    typename K,
    typename V>
__global__
void get_(map_t map, K* operations, cuda::std::pair<bool, V>* output,
          size_t size);
// Kernel declaration: bulk update of existing keys.
// Thread i applies operations[i] (key, new value) to `map` and writes
// {found, previous value} to output[i].
template<
    int block_size,
    typename map_t,
    typename K,
    typename V>
__global__
void update_(map_t map, cuda::std::pair<K, V>* operations, cuda::std::pair<bool, V>* output,
             size_t size);
// Template parameters of the lslab GPU hash map: key type K, mapped type V,
// a device-side allocator for slab nodes, and the hash functor used to pick
// a bucket.
// NOTE(review): this is a garbled extraction — the enclosing `class map {`
// line and member declarations are not visible in this chunk.
32 template<
typename K,
typename V,
typename Allocator = device_allocator<detail::slab_node<K, V>>,
typename Hash = hash<K>>
// Default constructor: delegates to map(10), i.e. 2^10 = 1024 buckets.
// NOTE(review): the closing brace of this delegating constructor was dropped
// by the extraction.
38 LSLAB_HOST
map() :
map(10) {
// Construct a map with 2^n_log_2 buckets: allocates the per-bucket lock table
// on the device and zero-initializes it (warp_mutex unlocked state is
// all-zero bytes, per the cudaMemset below).
// NOTE(review): cudaMalloc/cudaMemset return codes are ignored here, and the
// lines between extraction markers 46 and 53 (presumably the bucket-array
// allocation and the closing brace) were dropped — confirm against map.h.
42 LSLAB_HOST
map(
unsigned n_log_2) : number_of_buckets_log_2(n_log_2) {
43 size_t size = 1 << n_log_2;
44 cudaMalloc(&lock_table,
sizeof(
warp_mutex) * size);
46 cudaMemset(lock_table, 0,
sizeof(
warp_mutex) * size);
// Construct with 2^n_log_2 buckets and a caller-supplied slab allocator.
// NOTE(review): `a` is taken by rvalue reference but is copied into `alloc`,
// not moved — consider alloc(std::move(a)); confirm Allocator's move
// semantics before changing.
// NOTE(review): lines between extraction markers 57 and 72 were dropped
// (likely the bucket-array allocation and closing brace).
53 LSLAB_HOST
map(
unsigned n_log_2, Allocator&& a) : number_of_buckets_log_2(n_log_2), alloc(a) {
54 size_t size = 1 << n_log_2;
55 cudaMalloc(&lock_table,
sizeof(
warp_mutex) * size);
57 cudaMemset(lock_table, 0,
sizeof(
warp_mutex) * size);
// Destructor, callable from host or device.
// NOTE(review): the body was dropped by the extraction — presumably it frees
// lock_table / bucket storage; verify against map.h before assuming cleanup.
72 LSLAB_HOST_DEVICE ~
map() {
// Device-side lookup that applies `fn` to the value stored for `key`.
// `thread_mask` marks whether this lane has real work; lanes with
// thread_mask == false still enter so the warp-cooperative probe has full
// participation.
// NOTE(review): the leading `template<typename Fn>` line and the traversal
// call that follows the hash computation were dropped by the extraction.
76 LSLAB_DEVICE
void find_function(
const K& key, Fn&& fn,
bool thread_mask =
true) {
// Bucket count is a power of two, so masking with (2^log2 - 1) is the
// modulo reduction of the hash to a bucket index.
78 size_t hash = Hash{}(key);
79 hash &= ((1 << number_of_buckets_log_2) - 1);
// Device-side get: looks up `key` and, when found, copies the stored value
// into `value` via a local functor passed to find_function. Returns whether
// the key was found.
// NOTE(review): the local functor's struct declaration, its fields, the
// functor body, and the return statement were dropped by the extraction —
// only the operator() signature and the find_function call remain visible.
84 LSLAB_DEVICE
bool get(
const K& key, V& value,
bool thread_mask =
true) {
86 LSLAB_DEVICE
void operator()(
const V& val) {
96 find_function(key, fn, thread_mask);
// Device-side insert-or-apply: finds (or creates) the slot for `key` and
// applies `fn` to the stored value, using the warp-cooperative traversal.
// NOTE(review): `t` is declared on a line dropped by the extraction —
// presumably a detail::traverse policy object (traverse.h); confirm.
100 template<
typename Fn>
101 LSLAB_DEVICE
void insert_function(
const K& key, Fn&& fn,
bool thread_mask =
true) {
// Hash reduced to a bucket index by power-of-two masking.
104 size_t hash = Hash{}(key) & ((1 << number_of_buckets_log_2) - 1);
105 t.template operator()<K, V, Fn>(lock_table, buckets_array, key, std::forward<Fn>(fn), alloc,
hash, thread_mask);
// Device-side put: inserts or overwrites `value` for `key` via
// insert_function and returns the previous value for that key.
// NOTE(review): the local functor's declaration, fields, body, and the
// return statement were dropped by the extraction.
108 LSLAB_DEVICE V put(
const K& key,
const V& value,
bool thread_mask =
true) {
112 LSLAB_DEVICE
void operator()(V& val) {
122 insert_function(key, fn, thread_mask);
// Device-side update-if-present: applies `fn` to the value stored for `key`
// only when the key already exists; returns whether the key was found.
// NOTE(review): the entire body of this member was dropped by the
// extraction — only the signature remains.
126 template<
typename Fn>
127 LSLAB_DEVICE
bool update_function(
const K& key, Fn&& fn,
bool thread_mask =
true) {
// Device-side update: replaces the value for `key` with `value` when the key
// exists, returning {found, previous value}.
// NOTE(review): the local functor's struct declaration and its `found`/`tmp`
// fields are declared on lines dropped by the extraction; the return below
// reads them.
131 LSLAB_DEVICE cuda::std::pair<bool, V> update(
const K& key,
const V& value,
bool thread_mask =
true) {
133 LSLAB_DEVICE
void operator()(V& val) {
145 update_function(key, fn, thread_mask);
146 return {fn.found, fn.tmp};
// Host-side bulk put: one thread per (key, value) pair in `operations`;
// previous values are written to output[i]. Launch is asynchronous on
// `stream` (default stream 0) — synchronize the stream before reading
// `output`. Grid size is the ceiling of size / block_size.
template<
    int block_size = 256>
LSLAB_HOST
void put(cuda::std::pair<K, V>* operations, V* output,
         size_t size, cudaStream_t stream = 0x0) {
    put_<block_size, this_t, K, V><<<(size + block_size - 1) / block_size, block_size, 0, stream>>>(*this, operations, output, size);
}
// Host-side bulk put, structure-of-arrays overload: keys and values are
// supplied in separate arrays for coalesced device loads; previous values
// are written to output[i]. Asynchronous on `stream`; synchronize before
// reading `output`.
template<
    int block_size = 256>
LSLAB_HOST
void put(K* operations_keys, V* operations_values, V* output,
         size_t size, cudaStream_t stream = 0x0) {
    put_<block_size, this_t, K, V><<<(size + block_size - 1) / block_size, block_size, 0, stream>>>(*this, operations_keys, operations_values, output, size);
}
// Host-side bulk get: one thread per key in `operations`; writes
// {found, value} to output[i]. Asynchronous on `stream`; synchronize before
// reading `output`.
template<
    int block_size = 256>
LSLAB_HOST
void get(K* operations, cuda::std::pair<bool, V>* output,
         size_t size, cudaStream_t stream = 0x0) {
    get_<block_size, this_t, K, V><<<(size + block_size - 1) / block_size, block_size, 0, stream>>>(*this, operations, output, size);
}
// Host-side bulk update: one thread per (key, new value) pair in
// `operations`; writes {found, previous value} to output[i]. Asynchronous on
// `stream`; synchronize before reading `output`.
template<
    int block_size = 256>
LSLAB_HOST
void update(cuda::std::pair<K, V>* operations, cuda::std::pair<bool, V>* output,
            size_t size, cudaStream_t stream = 0x0) {
    update_<block_size, this_t, K, V><<<(size + block_size - 1) / block_size, block_size, 0, stream>>>(*this, operations, output, size);
}
// Number of buckets in the table — always a power of two,
// 2^number_of_buckets_log_2.
LSLAB_HOST_DEVICE
unsigned buckets() {
    return 1 << number_of_buckets_log_2;
}
// log2 of the bucket count; masking a hash with (1 << this) - 1 yields a
// bucket index (see find_function / insert_function).
177 unsigned number_of_buckets_log_2;
// Bulk-put kernel (pair input): thread i inserts operations[i] and records
// the previous value in output[i]. Every thread calls map.put so the
// warp-cooperative probe has full participation; out-of-range threads pass
// thread_mask = false and their key/val stay default-constructed.
// NOTE(review): the extraction dropped the local `K key; V val;`
// declarations and the closing braces; they are restored here from the
// visible uses.
template<
    int block_size,
    typename map_t,
    typename K,
    typename V>
__global__
void put_(map_t map, cuda::std::pair<K, V>* operations, V* output,
          size_t size) {
    int tidx = threadIdx.x;
    int bidx = blockIdx.x;

    K key;
    V val;
    if(tidx + bidx * block_size < size) {
        key = operations[tidx + bidx * block_size].first;
        val = operations[tidx + bidx * block_size].second;
    }

    V res = map.put(key, val, tidx + bidx * block_size < size);

    if(tidx + bidx * block_size < size) {
        output[tidx + bidx * block_size] = res;
    }
}
// Bulk-put kernel (structure-of-arrays input): thread i inserts
// (operations_keys[i], operations_values[i]) and records the previous value
// in output[i]. All threads call map.put for full warp participation;
// out-of-range threads pass thread_mask = false.
// NOTE(review): the extraction dropped the local `K key; V val;`
// declarations and the closing braces; they are restored here.
template<
    int block_size,
    typename map_t,
    typename K,
    typename V>
__global__
void put_(map_t map, K* operations_keys, V* operations_values, V* output,
          size_t size) {
    int tidx = threadIdx.x;
    int bidx = blockIdx.x;

    K key;
    V val;
    if(tidx + bidx * block_size < size) {
        key = operations_keys[tidx + bidx * block_size];
        val = operations_values[tidx + bidx * block_size];
    }

    V res = map.put(key, val, tidx + bidx * block_size < size);

    if(tidx + bidx * block_size < size) {
        output[tidx + bidx * block_size] = res;
    }
}
// Bulk-get kernel: thread i looks up operations[i] and writes
// {found, value} to output[i]. All threads call map.get for full warp
// participation; out-of-range threads pass thread_mask = false.
// NOTE(review): the extraction dropped the local `K key; V value;`
// declarations and the closing braces; they are restored here.
template<
    int block_size,
    typename map_t,
    typename K,
    typename V>
__global__
void get_(map_t map, K* operations, cuda::std::pair<bool, V>* output,
          size_t size) {
    int tidx = threadIdx.x;
    int bidx = blockIdx.x;

    K key;
    V value;
    if(tidx + bidx * block_size < size) {
        key = operations[tidx + bidx * block_size];
    }

    bool res = map.get(key, value, tidx + bidx * block_size < size);

    if(tidx + bidx * block_size < size) {
        output[tidx + bidx * block_size] = {res, value};
    }
}
// Bulk-update kernel: thread i applies operations[i] (key, new value) and
// writes {found, previous value} to output[i]. All threads make the map call
// for full warp participation; out-of-range threads pass thread_mask = false.
// FIX(review): the visible source called map.put here, whose V result cannot
// initialize cuda::std::pair<bool, V>; the update kernel must call
// map.update, which returns {found, previous value}.
// NOTE(review): the dropped `K key; V val;` declarations and closing braces
// are also restored.
template<
    int block_size,
    typename map_t,
    typename K,
    typename V>
__global__
void update_(map_t map, cuda::std::pair<K, V>* operations, cuda::std::pair<bool, V>* output,
             size_t size) {
    int tidx = threadIdx.x;
    int bidx = blockIdx.x;

    K key;
    V val;
    if(tidx + bidx * block_size < size) {
        key = operations[tidx + bidx * block_size].first;
        val = operations[tidx + bidx * block_size].second;
    }

    cuda::std::pair<bool, V> res = map.update(key, val, tidx + bidx * block_size < size);

    if(tidx + bidx * block_size < size) {
        output[tidx + bidx * block_size] = res;
    }
}
// lslab map for GPU
// Definition: map.h:33
// Definition: traverse.h:22
// Definition: warp_mutex.h:13