Intel HEXL for FPGA
Intel Homomorphic Encryption FPGA Acceleration Library, accelerating the modular arithmetic operations used in homomorphic encryption.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
fpga.h
Go to the documentation of this file.
1 // Copyright (C) 2020-2021 Intel Corporation
2 // SPDX-License-Identifier: Apache-2.0
3 
4 #ifndef __FPGA_H__
5 #define __FPGA_H__
6 
7 #include <atomic>
8 #include <condition_variable>
9 #include <deque>
10 #include <future>
11 #include <memory>
12 #include <mutex>
13 #include <thread>
14 #include <unordered_map>
15 #include <vector>
16 
17 #include "CL/opencl.h"
18 
19 namespace intel {
20 namespace hexl {
21 namespace fpga {
22 
// 128-bit unsigned integer alias built on the compiler-provided __int128 type.
// __extension__ marks the use of this non-standard type as intentional.
23 __extension__ typedef unsigned __int128 fpga_uint128_t;
30 typedef struct {
31  uint64_t modulus;
32  uint64_t len;
33  uint64_t barr_lo;
35 
43 
// Common base of the work-item descriptors declared in this header
// (Object_NTT, Object_INTT, Object_DyadicMultiply). The virtual destructor
// makes destruction through an Object* well-defined.
44 struct Object {
45 public:
    // Defined outside this header.
46  Object();
47  virtual ~Object() = default;
48 
    // Presumably set once the work item has been processed -- TODO confirm
    // against the implementation file.
49  bool ready_;
    // Identifier of this work item; presumably assigned from g_wid_ by the
    // constructor -- TODO confirm.
50  int id_;
    // Static counter shared by every Object; its definition lives outside
    // this header.
51  static unsigned int g_wid_;
52 };
53 
// Work item that stores the parameters of one Number Theoretic Transform.
// All pointers are raw and are only stored here; ownership is not visible
// from this header (presumably borrowed from the caller -- TODO confirm).
64 struct Object_NTT : public Object {
    // Takes the polynomial to transform, the root-of-unity power tables
    // (plain and preconditioned), the coefficient modulus and a size n.
    // Defined outside this header.
65  explicit Object_NTT(uint64_t* coeff_poly,
66  const uint64_t* root_of_unity_powers,
67  const uint64_t* precon_root_of_unity_powers,
68  uint64_t coeff_modulus, uint64_t n);
69 
    // Mutable polynomial buffer (non-const pointer).
70  uint64_t* coeff_poly_;
    // Read-only table of root-of-unity powers.
71  const uint64_t* root_of_unity_powers_;
    // Coefficient modulus of the transform.
73  uint64_t coeff_modulus_;
    // Size parameter; presumably the polynomial coefficient count -- TODO confirm.
74  uint64_t n_;
75 };
76 
// Work item that stores the parameters of one Inverse Number Theoretic
// Transform. All pointers are raw and are only stored here; ownership is not
// visible from this header (presumably borrowed from the caller -- TODO confirm).
91 struct Object_INTT : public Object {
    // Takes the polynomial to transform, the inverse root-of-unity power
    // tables (plain and preconditioned), the coefficient modulus, the values
    // inv_n and inv_n_w, and a size n. Defined outside this header.
92  explicit Object_INTT(uint64_t* coeff_poly,
93  const uint64_t* inv_root_of_unity_powers,
94  const uint64_t* precon_inv_root_of_unity_powers,
95  uint64_t coeff_modulus, uint64_t inv_n,
96  uint64_t inv_n_w, uint64_t n);
97 
    // Mutable polynomial buffer (non-const pointer).
98  uint64_t* coeff_poly_;
    // Read-only table of inverse root-of-unity powers.
99  const uint64_t* inv_root_of_unity_powers_;
    // Coefficient modulus of the transform.
101  uint64_t coeff_modulus_;
    // Presumably the modular inverse of n -- TODO confirm.
102  uint64_t inv_n_;
    // Presumably a precomputed companion value for inv_n_ -- TODO confirm.
103  uint64_t inv_n_w_;
    // Size parameter; presumably the polynomial coefficient count -- TODO confirm.
104  uint64_t n_;
105 };
// Work item that stores the parameters of one dyadic multiplication.
// All pointers are raw and are only stored here; ownership is not visible
// from this header (presumably borrowed from the caller -- TODO confirm).
116 struct Object_DyadicMultiply : public Object {
    // Takes the output buffer, the two read-only operands, a size n, the
    // read-only moduli array and the number of moduli. Defined outside this
    // header.
117  explicit Object_DyadicMultiply(uint64_t* results, const uint64_t* operand1,
118  const uint64_t* operand2, uint64_t n,
119  const uint64_t* moduli, uint64_t n_moduli);
120 
    // Destination buffer for the product (non-const pointer).
121  uint64_t* results_;
    // First read-only operand.
122  const uint64_t* operand1_;
    // Second read-only operand.
123  const uint64_t* operand2_;
    // Size parameter; presumably coefficients per polynomial -- TODO confirm.
124  uint64_t n_;
    // Read-only array of moduli.
125  const uint64_t* moduli_;
    // Number of moduli.
126  uint64_t n_moduli_;
127 };
// Holds a deque of pending Object* work items together with per-operation
// work-size bookkeeping for DyadicMultiply, NTT and INTT.
// For each operation the class tracks a total work size and a running counter
// (num_*) that the private update_* helpers decrement.
// The mutex / condition-variable members are not touched by any inline code in
// this header; presumably they guard buffer_ inside push/front/pop/size --
// TODO confirm against the implementation file.
155 class Buffer {
156 public:
    // Stores the capacity and the three per-operation batch sizes. Every
    // total_worksize_* starts at 1 and every num_* counter starts at 0.
157  Buffer(uint64_t capacity, uint64_t n_batch_dyadic_multiply,
158  uint64_t n_batch_ntt, uint64_t n_batch_intt)
159  : capacity_(capacity),
160  n_batch_dyadic_multiply_(n_batch_dyadic_multiply),
161  n_batch_ntt_(n_batch_ntt),
162  n_batch_intt_(n_batch_intt),
163  total_worksize_DyadicMultiply_(1),
164  num_DyadicMultiply_(0),
165  total_worksize_NTT_(1),
166  num_NTT_(0),
167  total_worksize_INTT_(1),
168  num_INTT_(0) {}
169 
    // Queue operations; declared here, defined outside this header.
    // pop() returns a vector, i.e. it can hand back several items at once.
170  void push(Object* obj);
171  Object* front();
172  std::vector<Object*> pop();
173 
    // Not const-qualified; defined outside this header.
174  uint64_t size();
175 
    // Getters for the total work size most recently set per operation.
176  uint64_t get_worksize_DyadicMultiply() const {
177  return total_worksize_DyadicMultiply_;
178  }
179  uint64_t get_worksize_NTT() const { return total_worksize_NTT_; }
180  uint64_t get_worksize_INTT() const { return total_worksize_INTT_; }
181 
    // Setters: record the new total and reset the matching num_* counter to
    // that same value. No locking is performed here.
182  void set_worksize_DyadicMultiply(uint64_t ws) {
183  total_worksize_DyadicMultiply_ = ws;
184  num_DyadicMultiply_ = total_worksize_DyadicMultiply_;
185  }
186  void set_worksize_NTT(uint64_t ws) {
187  total_worksize_NTT_ = ws;
188  num_NTT_ = total_worksize_NTT_;
189  }
190  void set_worksize_INTT(uint64_t ws) {
191  total_worksize_INTT_ = ws;
192  num_INTT_ = total_worksize_INTT_;
193  }
194 
195 private:
    // Each helper returns the smaller of the remaining counter and the
    // configured batch size for that operation.
196  uint64_t get_worksize_int_DyadicMultiply() const {
197  return ((num_DyadicMultiply_ > n_batch_dyadic_multiply_)
198  ? n_batch_dyadic_multiply_
199  : num_DyadicMultiply_);
200  }
201 
202  uint64_t get_worksize_int_NTT() const {
203  return ((num_NTT_ > n_batch_ntt_) ? n_batch_ntt_ : num_NTT_);
204  }
205 
206  uint64_t get_worksize_int_INTT() const {
207  return ((num_INTT_ > n_batch_intt_) ? n_batch_intt_ : num_INTT_);
208  }
209 
    // Decrement helpers for the num_* counters.
    // NOTE(review): update_work_size has exactly the same body as
    // update_DyadicMultiply_work_size (both decrement num_DyadicMultiply_);
    // it looks like a legacy duplicate -- confirm before removing.
    // NOTE(review): the counters are unsigned, so passing ws larger than the
    // current counter wraps around instead of stopping at 0. Callers
    // presumably pass at most the matching get_worksize_int_*() -- confirm.
210  void update_work_size(uint64_t ws) { num_DyadicMultiply_ -= ws; }
211  void update_DyadicMultiply_work_size(uint64_t ws) {
212  num_DyadicMultiply_ -= ws;
213  }
214  void update_NTT_work_size(uint64_t ws) { num_NTT_ -= ws; }
215  void update_INTT_work_size(uint64_t ws) { num_INTT_ -= ws; }
216 
    // Synchronization primitives and the underlying queue. The std::mutex
    // members make Buffer non-copyable and non-movable.
217  std::mutex mu_;
218  std::mutex mu_size_;
219  std::condition_variable cond_;
220  std::deque<Object*> buffer_;
    // Fixed at construction.
221  const uint64_t capacity_;
222  const uint64_t n_batch_dyadic_multiply_;
223  const uint64_t n_batch_ntt_;
224  const uint64_t n_batch_intt_;
225 
    // Per operation: total work size and the running counter derived from it.
226  uint64_t total_worksize_DyadicMultiply_;
227  uint64_t num_DyadicMultiply_;
228 
229  uint64_t total_worksize_NTT_;
230  uint64_t num_NTT_;
231 
232  uint64_t total_worksize_INTT_;
233  uint64_t num_INTT_;
234 };
// Abstract parent of the per-operation FPGA blobs (FPGAObject_NTT,
// FPGAObject_INTT, ...): it stores the batch of Objects to be transferred to
// the FPGA. Derived types implement fill_in_data / fill_out_data.
250 struct FPGAObject {
    // Binds to an existing OpenCL context and records a batch size.
    // Defined outside this header.
251  FPGAObject(const cl_context& context, uint64_t n_batch);
252  virtual ~FPGAObject() = default;
    // Pure virtual: load this blob from a batch of work items.
253  virtual void fill_in_data(const std::vector<Object*>& objs) = 0;
    // Pure virtual: consume the output buffer passed in `results`.
254  virtual void fill_out_data(uint64_t* results) = 0;
255 
    // Defined outside this header; presumably resets the blob for reuse --
    // TODO confirm.
256  void recycle();
257 
    // Reference member: the referenced cl_context must outlive this object.
258  const cl_context& context_;
    // Tag of this blob; presumably drawn from g_tag_ -- TODO confirm.
259  int tag_;
260  uint64_t n_batch_;
261 
    // Raw pointers to the work items held by this blob; ownership is not
    // visible from this header.
262  std::vector<Object*> in_objs_;
263 
    // Atomic counter shared by every FPGAObject; defined outside this header.
264  static std::atomic<int> g_tag_;
265 };
266 
// Stores the NTT blob of objects to be transferred to the FPGA.
282 struct FPGAObject_NTT : public FPGAObject {
    // Takes the OpenCL context, a coefficient count and a batch size.
    // Constructor and destructor are defined outside this header.
283  explicit FPGAObject_NTT(const cl_context& context, uint64_t coeff_count,
284  uint64_t batch_size);
285  ~FPGAObject_NTT();
    // Overrides of the FPGAObject interface.
286  void fill_in_data(const std::vector<Object*>& objs) override;
287  void fill_out_data(uint64_t* coeff_poly) override;
288 
    // Size parameter; presumably derived from coeff_count -- TODO confirm.
293  uint64_t n_;
294 };
295 
// Stores the INTT blob of objects to be transferred to the FPGA.
313 struct FPGAObject_INTT : public FPGAObject {
    // Takes the OpenCL context, a coefficient count and a batch size.
    // Defined outside this header.
314  explicit FPGAObject_INTT(const cl_context& context, uint64_t coeff_count,
315  uint64_t batch_size);
    // Overrides of the FPGAObject interface.
317  void fill_in_data(const std::vector<Object*>& objs) override;
318  void fill_out_data(uint64_t* coeff_poly) override;
319 
    // Raw buffers; the _svm_ suffix presumably means OpenCL shared virtual
    // memory, with allocation and release handled outside this header --
    // TODO confirm.
324  uint64_t* inv_n_in_svm_;
325  uint64_t* inv_n_w_in_svm_;
    // Size parameter; presumably derived from coeff_count -- TODO confirm.
326  uint64_t n_;
327 };
328 
347  explicit FPGAObject_DyadicMultiply(const cl_context& context,
348  uint64_t coeff_size,
349  uint32_t modulus_size,
350  uint64_t batch_size);
352  void fill_in_data(const std::vector<Object*>& objs) override;
353  void fill_out_data(uint64_t* results) override;
354 
355  uint64_t* operand1_in_svm_;
356  uint64_t* operand2_in_svm_;
358  uint64_t n_;
359  uint64_t n_moduli_;
362 };
// Device mode selector: NONE (= 0), EMU, FPGA. Unscoped enum, so the
// enumerators are visible directly in the enclosing namespace.
// Presumably NONE means no accelerator and EMU means emulation -- TODO confirm.
367 typedef enum { NONE = 0, EMU, FPGA } DEV_TYPE;
// Wraps one OpenCL device: it keeps a reference to the shared Buffer, an exit
// signal, the OpenCL context/program, and per-operation command queues,
// kernels and result buffers for DYADIC_MULTIPLY, NTT and INTT.
// All member functions other than device_id() are defined outside this header.
// NOTE(review): std::string is used below, but <string> is not among the
// includes visible at the top of this header; it is presumably pulled in
// transitively -- consider including it explicitly.
385 class Device {
386 public:
    // Binds to an OpenCL device id and the shared Buffer, and receives the
    // exit signal plus size / batch / debug parameters.
387  Device(const cl_device_id& device, Buffer& buffer,
388  std::shared_future<bool> exit_signal, uint64_t coeff_size,
389  uint32_t modulus_size, uint64_t batch_size_dyadic_multiply,
390  uint64_t batch_size_ntt, uint64_t batch_size_intt, uint32_t debug);
391  ~Device();
392 
    // Presumably the worker loop executed by a DevicePool runner thread --
    // TODO confirm.
393  void run();
394 
395 private:
    // Compile-time constant CREDIT = 8; presumably the initial value of
    // credit_ -- TODO confirm.
396  enum { CREDIT = 8 };
397 
    // Kind of kernel this device handles.
398  enum class kernel_t { INTEGRATED, DYADIC_MULTIPLY, NTT, INTT };
399  void process_blocking_api();
400  bool process_input(int index);
401  bool process_output();
402 
    // Per-operation output handlers.
403  bool process_output_dyadic_multiply();
404  bool process_output_NTT();
405  bool process_output_INTT();
406 
    // Generic input-enqueue entry point plus one overload-like helper per
    // concrete blob type.
407  void enqueue_input_data(FPGAObject* fpga_obj);
408  void enqueue_input_data_dyadic_multiply(
409  FPGAObject_DyadicMultiply* fpga_obj);
410  void enqueue_input_data_NTT(FPGAObject_NTT* fpga_obj);
411  void enqueue_input_data_INTT(FPGAObject_INTT* fpga_obj);
412 
    // Returns this instance's id_.
413  int device_id() { return id_; }
414 
415  kernel_t get_kernel_type();
416  std::string get_bitstream_name();
417 
    // Reference members: the referenced cl_device_id and Buffer must outlive
    // this Device.
418  const cl_device_id& device_;
419  Buffer& buffer_;
420  unsigned int credit_;
421  std::shared_future<bool> future_exit_;
422  int id_;
    // Static counter shared by all Device instances; presumably the source of
    // id_ -- TODO confirm.
423  static int device_id_;
424  kernel_t kernel_type_;
425 
    // Raw pointers to the blobs used by this device; ownership is not visible
    // from this header.
426  std::vector<FPGAObject*> fpgaObjects_;
427 
428  cl_context context_;
429  cl_program program_;
430 
431  // DYADIC_MULTIPLY section
432  cl_command_queue dyadic_multiply_input_queue_;
433  cl_command_queue dyadic_multiply_output_queue_;
434  cl_kernel dyadic_multiply_input_fifo_kernel_;
435  cl_kernel dyadic_multiply_output_fifo_nb_kernel_;
436 
437  uint64_t* dyadic_multiply_results_out_svm_;
438  int* dyadic_multiply_tag_out_svm_;
439  int* dyadic_multiply_results_out_valid_svm_;
440  //
441 
442  // NTT section
443  cl_command_queue ntt_load_queue_;
444  cl_command_queue ntt_store_queue_;
445  cl_kernel ntt_load_kernel_;
446  cl_kernel ntt_store_kernel_;
447 
448  uint64_t* NTT_coeff_poly_svm_;
449 
450  // INTT section
451  cl_command_queue intt_INTT_queue_;
452  cl_command_queue intt_load_queue_;
453  cl_command_queue intt_store_queue_;
454  cl_kernel intt_INTT_kernel_;
455  cl_kernel intt_load_kernel_;
456  cl_kernel intt_store_kernel_;
457 
458  uint64_t* INTT_coeff_poly_svm_;
459  //
460 
    // Debug setting received by the constructor.
461  uint32_t debug_;
462 
    // Static lookup table from a string key to a kernel_t; presumably keyed
    // by kernel or bitstream name -- TODO confirm.
463  static const std::unordered_map<std::string, kernel_t> kernels;
464 };
// Holds the OpenCL platform, the discovered device ids, the Device objects
// built on them and the threads that run them. Copying is disabled.
480 class DevicePool {
481 public:
    // Receives a device choice, the shared Buffer, the exit-signal future
    // (taken by non-const reference) and the size / batch / debug parameters.
    // `choice` presumably corresponds to a DEV_TYPE value -- TODO confirm.
482  DevicePool(int choice, Buffer& buffer, std::future<bool>& exit_signal,
483  uint64_t coeff_size, uint32_t modulus_size,
484  uint64_t batch_size_dyadic_multiply, uint64_t batch_size_ntt,
485  uint64_t batch_size_intt, uint32_t debug);
486  ~DevicePool();
487 
488 private:
    // Non-copyable: copy construction and copy assignment are deleted.
489  DevicePool(const DevicePool& d) = delete;
490  DevicePool& operator=(const DevicePool& d) = delete;
491 
492  cl_platform_id platform_;
493  cl_uint device_count_;
    // Raw arrays; presumably allocated in the constructor and released in the
    // destructor, with device_count_ entries each -- TODO confirm.
494  cl_device_id* cl_devices_;
495  Device** devices_;
    // Shared form of the exit signal, copyable to each Device.
496  std::shared_future<bool> future_exit_;
497 
    // Worker threads; presumably one per Device, each executing Device::run --
    // TODO confirm.
498  std::vector<std::thread> runners_;
499 };
// Attach a device to this thread. Defined outside this header.
504 void attach_fpga_pooling();
// Detach a device from this thread. Defined outside this header.
509 void detach_fpga_pooling();
510 
511 } // namespace fpga
512 } // namespace hexl
513 } // namespace intel
514 
515 #endif
uint64_t n_moduli_
Definition: fpga.h:126
uint64_t n_
Definition: fpga.h:358
uint64_t barr_lo
Definition: fpga.h:33
Definition: fpga.h:367
uint64_t n_
Definition: fpga.h:124
virtual ~FPGAObject()=default
Struct FPGAObject_NTT stores the NTT blob of objects to be transferred to the FPGA.
Definition: fpga.h:282
uint64_t len
Definition: fpga.h:32
const uint64_t * inv_root_of_unity_powers_
Definition: fpga.h:99
Definition: fpga.h:367
Object_DyadicMultiply(uint64_t *results, const uint64_t *operand1, const uint64_t *operand2, uint64_t n, const uint64_t *moduli, uint64_t n_moduli)
Parent Struct FPGAObject stores the blob of objects to be transferred to the FPGA. ...
Definition: fpga.h:250
uint64_t n_moduli_
Definition: fpga.h:359
uint64_t * coeff_modulus_in_svm_
Definition: fpga.h:292
const uint64_t * operand1_
Definition: fpga.h:122
struct Object_DyadicMultiply Stores the parameters for the multiplication
Definition: fpga.h:116
Class Device.
Definition: fpga.h:385
FPGAObject_INTT(const cl_context &context, uint64_t coeff_count, uint64_t batch_size)
FPGAObject(const cl_context &context, uint64_t n_batch)
uint64_t * inv_n_in_svm_
Definition: fpga.h:324
void attach_fpga_pooling()
attach_fpga_pooling Attach a device to this thread
virtual void fill_in_data(const std::vector< Object * > &objs)=0
uint64_t * coeff_poly_
Definition: fpga.h:98
void fill_in_data(const std::vector< Object * > &objs) override
Struct Object.
Definition: fpga.h:44
Struct Buffer Structure containing information for the polynomial operations.
Definition: fpga.h:155
DEV_TYPE
enum DEV_TYPE Lists the available device modes: CPU, emulation mode, FPGA
Definition: fpga.h:367
static unsigned int g_wid_
Definition: fpga.h:51
virtual ~Object()=default
const uint64_t * precon_root_of_unity_powers_
Definition: fpga.h:72
Device(const cl_device_id &device, Buffer &buffer, std::shared_future< bool > exit_signal, uint64_t coeff_size, uint32_t modulus_size, uint64_t batch_size_dyadic_multiply, uint64_t batch_size_ntt, uint64_t batch_size_intt, uint32_t debug)
uint64_t n_
Definition: fpga.h:326
bool ready_
Definition: fpga.h:49
uint64_t * results_
Definition: fpga.h:121
uint64_t * coeff_modulus_in_svm_
Definition: fpga.h:323
Object_INTT(uint64_t *coeff_poly, const uint64_t *inv_root_of_unity_powers, const uint64_t *precon_inv_root_of_unity_powers, uint64_t coeff_modulus, uint64_t inv_n, uint64_t inv_n_w, uint64_t n)
uint64_t * operand2_in_svm_
Definition: fpga.h:356
void set_worksize_INTT(uint64_t ws)
Definition: fpga.h:190
Struct FPGAObject_DyadicMultiply Stores the multiplication blob of objects to be transferred to the FPGA.
Definition: fpga.h:346
void fill_out_data(uint64_t *coeff_poly) override
uint64_t * operand1_in_svm_
Definition: fpga.h:355
uint64_t get_worksize_NTT() const
Definition: fpga.h:179
Struct moduli_info_t.
Definition: fpga.h:30
uint64_t coeff_modulus_
Definition: fpga.h:73
int tag_
Definition: fpga.h:259
uint64_t modulus
Definition: fpga.h:31
cl_mem results_out_ddr_
Definition: fpga.h:361
void fill_in_data(const std::vector< Object * > &objs) override
moduli_info_t * moduli_info_
Definition: fpga.h:357
uint64_t n_
Definition: fpga.h:74
uint64_t * inv_root_of_unity_powers_in_svm_
Definition: fpga.h:321
int id_
Definition: fpga.h:50
uint64_t coeff_modulus_
Definition: fpga.h:101
uint64_t get_worksize_INTT() const
Definition: fpga.h:180
uint64_t inv_n_
Definition: fpga.h:102
void set_worksize_DyadicMultiply(uint64_t ws)
Definition: fpga.h:182
uint64_t n_
Definition: fpga.h:293
Struct Object NTT Stores the Number Theoretic Transform parameters.
Definition: fpga.h:64
uint64_t n_batch_
Definition: fpga.h:260
uint64_t * root_of_unity_powers_in_svm_
Definition: fpga.h:290
uint64_t * coeff_poly_in_svm_
Definition: fpga.h:320
std::vector< Object * > pop()
uint64_t inv_n_w_
Definition: fpga.h:103
static std::atomic< int > g_tag_
Definition: fpga.h:264
uint64_t * coeff_poly_in_svm_
Definition: fpga.h:289
void fill_out_data(uint64_t *coeff_poly) override
const uint64_t * operand2_
Definition: fpga.h:123
Object_NTT(uint64_t *coeff_poly, const uint64_t *root_of_unity_powers, const uint64_t *precon_root_of_unity_powers, uint64_t coeff_modulus, uint64_t n)
uint64_t * precon_root_of_unity_powers_in_svm_
Definition: fpga.h:291
cl_mem operands_in_ddr_
Definition: fpga.h:360
const cl_context & context_
Definition: fpga.h:258
void push(Object *obj)
Definition: fpga.h:367
uint64_t n_
Definition: fpga.h:104
uint64_t * coeff_poly_
Definition: fpga.h:70
void fill_out_data(uint64_t *results) override
const uint64_t * moduli_
Definition: fpga.h:125
DevicePool(int choice, Buffer &buffer, std::future< bool > &exit_signal, uint64_t coeff_size, uint32_t modulus_size, uint64_t batch_size_dyadic_multiply, uint64_t batch_size_ntt, uint64_t batch_size_intt, uint32_t debug)
Class DevicePool.
Definition: fpga.h:480
const uint64_t * precon_inv_root_of_unity_powers_
Definition: fpga.h:100
void detach_fpga_pooling()
detach_fpga_pooling Detach a device from this thread
Struct Object INTT Stores the Inverse Number Theoretic Transform parameters.
Definition: fpga.h:91
FPGAObject_DyadicMultiply(const cl_context &context, uint64_t coeff_size, uint32_t modulus_size, uint64_t batch_size)
uint64_t * inv_n_w_in_svm_
Definition: fpga.h:325
Struct FPGAObject_INTT stores the INTT blob of objects to be transferred to the FPGA.
Definition: fpga.h:313
void set_worksize_NTT(uint64_t ws)
Definition: fpga.h:186
virtual void fill_out_data(uint64_t *results)=0
const uint64_t * root_of_unity_powers_
Definition: fpga.h:71
void fill_in_data(const std::vector< Object * > &objs) override
Buffer(uint64_t capacity, uint64_t n_batch_dyadic_multiply, uint64_t n_batch_ntt, uint64_t n_batch_intt)
Definition: fpga.h:157
uint64_t get_worksize_DyadicMultiply() const
Definition: fpga.h:176
__extension__ typedef unsigned __int128 fpga_uint128_t
Definition: fpga.h:23
uint64_t * precon_inv_root_of_unity_powers_in_svm_
Definition: fpga.h:322
std::vector< Object * > in_objs_
Definition: fpga.h:262
FPGAObject_NTT(const cl_context &context, uint64_t coeff_count, uint64_t batch_size)