5 жил өмнө · 0176979b4b
--- a/.gitignore
+++ b/.gitignore
@@ -2,5 +2,5 @@
 
				 .vs/
			
 
				 .vscode/
			
 
				 *.tar
			
 
				-__pycache__/
			
 
				-*.o
			
 
				+__pycache__
			
 
				+/build/
			
--- a/build/op_lib.so
+++ b/build/op_lib.so
--- a/configure
+++ b/configure
@@ -0,0 +1,12 @@
 
				+#!/bin/bash
			
 
				+
			
 
				+BUILD_DIR=$1
			
 
				+if [ "" = "$BUILD_DIR" ]; then
			
 
				+    BUILD_DIR='./build'
			
 
				+fi
			
 
				+
			
 
				+mkdir -p $BUILD_DIR
			
 
				+
			
 
				+python3 -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_compile_flags()))' > $BUILD_DIR/TF_CFLAGS
			
 
				+python3 -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_link_flags()))' > $BUILD_DIR/TF_LFLAGS
			
 
				+
			
--- a/examples/train.py
+++ b/examples/train.py
@@ -39,13 +39,27 @@ print(x_test.shape[0], 'test samples')
 
				 y_train = to_categorical(y_train, num_classes)
			
 
				 y_test = to_categorical(y_test, num_classes)
			
 
				 
			
 
				+a = layers.Input(shape=(28, 28, 1))
			
 
				+b = Conv2DFPGA(2)(a)
			
 
				+c = Conv2DFPGA(2)(a)
			
 
				+d = Conv2DFPGA(2)(a)
			
 
				+e = Conv2DFPGA(2)(a)
			
 
				+
			
 
				+x = layers.Add()([b,c,d,e])
			
 
				+y = layers.Flatten()(x)
			
 
				+z = layers.Dense(num_classes, activation='softmax')(y)
			
 
				+
			
 
				+model = Model(inputs=a, outputs=z)
			
 
				+"""
			
 
				 model = Sequential()
			
 
				 model.add(Conv2DFPGA([0,0]))
			
 
				+model.add(Conv2DFPGA([0,0]))
			
 
				+model.add(Conv2DFPGA([0,0]))
			
 
				 model.add(Flatten())
			
 
				 model.add(Dense(128, activation='relu'))
			
 
				 model.add(Dropout(0.5))
			
 
				 model.add(Dense(num_classes, activation='softmax'))
			
 
				-
			
 
				+"""
			
 
				 model.compile(loss=keras.losses.categorical_crossentropy,
			
 
				               optimizer=keras.optimizers.Adadelta(),
			
 
				               metrics=['accuracy'])
			
--- a/layers/conv2D.py
+++ b/layers/conv2D.py
@@ -1,13 +1,54 @@
 
				 import tensorflow as tf
			
 
				-from tensorflow.keras import layers
			
 
				+from tensorflow.python.framework import tensor_shape
			
 
				+from tensorflow.keras import layers, initializers, regularizers, constraints
			
 
				 
			
 
				 from .. import load_op
			
 
				 
			
 
				 class Conv2D(layers.Layer):
			
 
				-  def __init__(self, kernel):
			
 
				+  def __init__(self,
			
 
				+    filters = 1,
			
 
				+    kernel_initializer = 'glorot_uniform',
			
 
				+               kernel_regularizer=None,
			
 
				+               kernel_constraint=None,
			
 
				+    ):
			
 
				     super(Conv2D, self).__init__()
			
 
				-    self.kernel = kernel
			
 
				+    #int, dim of output space
			
 
				+    self.filters = filters
			
 
				+    self.kernel_initializer = initializers.get(kernel_initializer)
			
 
				+    self.kernel_regularizer = regularizers.get(kernel_regularizer)
			
 
				+    self.kernel_constraint = constraints.get(kernel_constraint)
			
 
				+
			
 
				+
			
 
				+  def build(self, input_shape):
			
 
				+    input_shape = tf.TensorShape(input_shape)
			
 
				+    self.input_channel = input_shape[3]
			
 
				+    kernel_shape = (5,)*2 + (self.input_channel, self.filters)
			
 
				+
			
 
				+    self.kernel = self.add_weight(
			
 
				+        name='kernel',
			
 
				+        shape=kernel_shape,
			
 
				+        initializer=self.kernel_initializer,
			
 
				+        regularizer=self.kernel_regularizer,
			
 
				+        constraint=self.kernel_constraint,
			
 
				+        trainable=True,
			
 
				+        dtype=self.dtype)
			
 
				+
			
 
				   def call(self, inputs):
			
 
				-    ints = tf.dtypes.cast(inputs, dtype=tf.int32)
			
 
				-    outs = load_op.op_lib.MyConv2D(input=ints, filter=ints)
			
 
				+
			
 
				+    #out = tf.Tensor(tf.int32, shape=inputs.shape)
			
 
				+
			
 
				+    ch_inputs = tf.unstack(tf.dtypes.cast(inputs, dtype=tf.int32), axis=3)
			
 
				+    ch_kernel = tf.unstack(tf.dtypes.cast(self.kernel, dtype=tf.int32), axis=2)
			
 
				+
			
 
				+    ch_outputs = [None] * len(ch_inputs)
			
 
				+
			
 
				+    for ch in range(len(ch_inputs)):
			
 
				+      print(ch_inputs[ch], ch_kernel[ch])
			
 
				+      ch_outputs[ch] = [None] * self.filters
			
 
				+      kernel_2d = tf.unstack(ch_kernel[ch], axis=2)
			
 
				+      for f in range(len(kernel_2d)):
			
 
				+        ch_outputs[ch][f] = load_op.op_lib.MyConv2D(input=ch_inputs[ch], filter=kernel_2d[f])
			
 
				+      
			
 
				+      ch_outputs[ch] = tf.stack(ch_outputs[ch], axis=2)
			
 
				+    outs = tf.stack(ch_outputs, axis=2)
			
 
				     return tf.dtypes.cast(outs, dtype=tf.float32)
			
--- a/makefile
+++ b/makefile
@@ -3,28 +3,30 @@ CXX=/usr/bin/g++
 
				 CFLAGS=-g -Wall -pthread -std=c++11
			
 
				 LFLAGS=-shared -Wl,--no-as-needed
			
 
				 
			
 
				-TF_CFLAGS=$(shell python3 -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_compile_flags()))' 2>/dev/null)
			
 
				-TF_LFLAGS=$(shell python3 -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_link_flags()))' 2>/dev/null)
			
 
				-
			
 
				 SRC_DIR=./src
			
 
				 INC_DIR=./src
			
 
				 BUILD_DIR=./build
			
 
				 
			
 
				+TF_CFLAGS=$(shell cat $(BUILD_DIR)/TF_CFLAGS)
			
 
				+TF_LFLAGS=$(shell cat $(BUILD_DIR)/TF_LFLAGS)
			
 
				+
			
 
				 SRCS=$(wildcard $(SRC_DIR)/*.cpp)
			
 
				 OBJS=$(patsubst $(SRC_DIR)/%.cpp,$(BUILD_DIR)/%.o,$(SRCS))
			
 
				 
			
 
				 EXECUTABLE=op_lib.so
			
 
				 
			
 
				-all: dir $(BUILD_DIR)/$(EXECUTABLE)
			
 
				+all: config $(BUILD_DIR)/$(EXECUTABLE)
			
 
				 
			
 
				-dir:
			
 
				-	mkdir -p $(BUILD_DIR)
			
 
				+config:
			
 
				+	@if [ ! -d "$(BUILD_DIR)" ]; then ./configure $(BUILD_DIR) || exit 1; fi
			
 
				 
			
 
				 $(BUILD_DIR)/$(EXECUTABLE): $(OBJS)
			
 
				 	$(CXX) $(LFLAGS) $(TF_LFLAGS) -o $@ $^
			
 
				 
			
 
				 $(OBJS): $(BUILD_DIR)/%.o : $(SRC_DIR)/%.cpp $(INC_DIR)/%.hpp
			
 
				-	$(CXX) $(CFLAGS) -fPIC -c $(TF_CFLAGS) -I$(INC_DIR) -o $@ $< -O2
			
 
				+	$(CXX) $(CFLAGS) -fPIC -c $(TF_CFLAGS) -I$(INC_DIR) -o $@ $<
			
 
				+
			
 
				+tf_cflags:
			
 
				 
			
 
				 clean:
			
 
				 	rm -f $(BUILD_DIR)/*.o $(BUILD_DIR)/$(EXECUTABLE)
			
--- a/src/conv2D.cpp
+++ b/src/conv2D.cpp
@@ -3,7 +3,23 @@
 
				 
			
 
				 #include "conv2D.hpp"
			
 
				 
			
 
				-void Conv2DOp::Compute(OpKernelContext* context) {
			
 
				+volatile int instances = 0;
			
 
				+pthread_t tDelay;
			
 
				+pthread_attr_t attr;
			
 
				+typedef void (*fptr)();
			
 
				+void *delayThread(void *ref) {
			
 
				+  sleep(1);
			
 
				+  fptr done = reinterpret_cast<fptr>(ref);
			
 
				+  printf("cb!\n");
			
 
				+  done();
			
 
				+  return 0;
			
 
				+}
			
 
				+
			
 
				+Conv2DOp::Conv2DOp(OpKernelConstruction* context) : AsyncOpKernel(context) {
			
 
				+  instance = instances++;
			
 
				+};
			
 
				+
			
 
				+void Conv2DOp::ComputeAsync(OpKernelContext* context, DoneCallback done) {
			
 
				   // Input tensor is of the following dimensions:
			
 
				   // [ batch, in_rows, in_cols, in_depth ]
			
 
				   const Tensor& input = context->input(0);
			
@@ -11,6 +27,15 @@ void Conv2DOp::Compute(OpKernelContext* context) {
 
				   // Input filter is of the following dimensions:
			
 
				   // [ filter_rows, filter_cols, in_depth, out_depth]
			
 
				   const Tensor& filter = context->input(1);
			
 
				+  TensorShape filterShape = filter.shape();
			
 
				+
			
 
				+
			
 
				+  printf("\ninstance: %d shape: ", instance);
			
 
				+  for(int i=0; i<filterShape.dims(); i++) {
			
 
				+    printf(" %lld", filter.shape().dim_size(i));
			
 
				+  }
			
 
				+  printf("\n");
			
 
				+  sleep(1);
			
 
				 
			
 
				   TensorShape out_shape = input.shape();
			
 
				 
			
@@ -19,12 +44,7 @@ void Conv2DOp::Compute(OpKernelContext* context) {
 
				   Tensor* output = nullptr;
			
 
				   OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
			
 
				 
			
 
				-  std::cout << "Conv2D" << std::endl;
			
 
				-
			
 
				-  // If there is nothing to compute, return.
			
 
				-  if (out_shape.num_elements() == 0) {
			
 
				-    return;
			
 
				-  }
			
 
				+  pthread_create(&tDelay, &attr, delayThread, static_cast<void*>(&done));
			
 
				 
			
 
				   
			
 
				 }
			
--- a/src/conv2D.hpp
+++ b/src/conv2D.hpp
@@ -1,15 +1,20 @@
 
				 #include "tensorflow/core/framework/op_kernel.h"
			
 
				 #include "tensorflow/core/framework/function.h"
			
 
				+#include <stdlib.h>
			
 
				+
			
 
				+#include <pthread.h>
			
 
				 
			
 
				 using namespace tensorflow;
			
 
				 typedef FunctionDefHelper FDH;
			
 
				 
			
 
				 
			
 
				-class Conv2DOp : public OpKernel {
			
 
				- public:
			
 
				-  explicit Conv2DOp(OpKernelConstruction* context) : OpKernel(context) {};
			
 
				+class Conv2DOp : public AsyncOpKernel {
			
 
				+  public:
			
 
				+    explicit Conv2DOp(OpKernelConstruction* context);
			
 
				 
			
 
				-  void Compute(OpKernelContext* context) override;
			
 
				+    void ComputeAsync(OpKernelContext* context, DoneCallback done) override;
			
 
				 
			
 
				+  private:
			
 
				+    int instance = -1;
			
 
				   //TF_DISALLOW_COPY_AND_ASSIGN(Conv2DOp);
			
 
				 };