Merge pull request #89 from SiliconJackets/main

SiliconJackets Submission to Code-a-Chip
sscs-ose · Apr 18, 2024 · a165b1f · a165b1f
2 parents 6d697ac + d6db107
commit a165b1f
Show file tree

Hide file tree

Showing 16 changed files with 2,761 additions and 0 deletions.
diff --git a/VLSI24/submitted_notebooks/SJSystolicArray/README.md b/VLSI24/submitted_notebooks/SJSystolicArray/README.md
@@ -0,0 +1,2 @@
+# SiliconJackets Systolic Array
+This notebook walks through the design specification, simulation, and implementation of a systolic array using open-source tools and PDKs. The parallel computation and data-reuse capabilities of a systolic array are crucial for accelerating neural networks, and this notebook, together with its reusable design, aims to contribute to the open-source hardware community and enable more efficient ML applications. This project explains the principles behind how a systolic array performs 2D convolution, demonstrates the performance of our implementation with image results, and shows the final GDS generated with an open-source flow. Additionally, to further demonstrate the feasibility of the open-source flow and our design, we are also submitting this systolic array design to the open-source silicon initiative [Tiny Tapeout](https://tinytapeout.com/). This submission was completed by members of SiliconJackets, a student-run organization at Georgia Tech that introduces students to semiconductor design, verification, and implementation through a large collaborative project. We hope to use this notebook as an example for future members of the club.
diff --git a/VLSI24/submitted_notebooks/SJSystolicArray/SystolicArray.ipynb b/VLSI24/submitted_notebooks/SJSystolicArray/SystolicArray.ipynb
diff --git a/VLSI24/submitted_notebooks/SJSystolicArray/img/Ctrl.png b/VLSI24/submitted_notebooks/SJSystolicArray/img/Ctrl.png
diff --git a/VLSI24/submitted_notebooks/SJSystolicArray/img/PE.png b/VLSI24/submitted_notebooks/SJSystolicArray/img/PE.png
diff --git a/VLSI24/submitted_notebooks/SJSystolicArray/img/Top.png b/VLSI24/submitted_notebooks/SJSystolicArray/img/Top.png
diff --git a/VLSI24/submitted_notebooks/SJSystolicArray/img/systolic_array_flow.gif b/VLSI24/submitted_notebooks/SJSystolicArray/img/systolic_array_flow.gif
diff --git a/VLSI24/submitted_notebooks/SJSystolicArray/img/systolicarray.jpg b/VLSI24/submitted_notebooks/SJSystolicArray/img/systolicarray.jpg
diff --git a/VLSI24/submitted_notebooks/SJSystolicArray/img/systolicarray.png b/VLSI24/submitted_notebooks/SJSystolicArray/img/systolicarray.png
diff --git a/VLSI24/submitted_notebooks/SJSystolicArray/src/PE.sv b/VLSI24/submitted_notebooks/SJSystolicArray/src/PE.sv
@@ -0,0 +1,109 @@
+// Processing element (PE) of the systolic array.
+//
+// Holds three filter values and three ifmap values in shift-register
+// scratchpads. Once start_conv is seen while idle, the PE multiplies one
+// filter/ifmap pair per cycle (selected by `counter`), accumulates the
+// truncated products in psum_spad, and finally adds psum_i and latches the
+// total into psum_buffer, which drives psum_o. psum_valid_o is a registered
+// copy of acc_psum, so it is updated on the same clock edge as psum_buffer.
+module PE
+    (
+        input logic clk_i, rstn_i,          // clock; rstn_i is an asynchronous, active-low reset
+        input logic [9:0] psum_i,           // incoming partial sum, added in the final step
+        input logic [7:0] filter_i,         // next filter value to shift into filter_spad
+        input logic [7:0] ifmap_i,          // next ifmap value to shift into ifmap_spad
+        input logic read_new_filter_val,    // when high, shift filter_spad and load filter_i
+        input logic read_new_ifmap_val,     // when high, shift ifmap_spad and load ifmap_i
+        input logic start_conv,             // starts a computation when the PE is not calculating
+        output logic [9:0] psum_o,          // contents of psum_buffer
+        output logic psum_valid_o           // registered acc_psum
+    );
+
+    //Scratchpad regs
+    logic signed [7:0] filter_spad [0:2];
+    logic signed [7:0] ifmap_spad [0:2];
+    logic signed [9:0] psum_spad; // running sum of the truncated products
+
+    //psum buffer reg
+    logic signed [9:0] psum_buffer;
+
+    //datapath wires
+    // logic signed [DATA_SIZE-1:0] mult_input_filter, mult_input_ifmap; //wires between regs and multiplier
+    logic signed [15:0] mult_out_raw; //full multiplication result
+    logic signed [9:0] mult_out_trunc; // bits [15:6] of mult_out_raw
+    logic signed [9:0] adder_input, adder_output, psum_spad_input; // result of multiplexor. chooses either result of MAC or the psum from above PE to go to adder
+
+    //counter reg and wires
+    logic [1:0] counter; //Tells which regs to use in scratchpad
+    logic [1:0] next_counter; // 1 + index
+    logic acc_psum; // high when counter == 3: the step that adds psum_i
+
+    //state reg and wire
+    logic next_calculating;
+    logic calculating;
+
+
+    // Combinational datapath and next-state logic.
+    always_comb begin
+        //============= Time to accumulate psum? ===============
+        // counter values 0..2 select a scratchpad entry; value 3 is the
+        // extra step in which psum_i is added instead of a product.
+        acc_psum = (counter == 2'd3);
+
+        //============= Next State ==================
+        // Enter the calculating state on start_conv while idle, and stay in
+        // it until the acc_psum step has been reached.
+        if ((!calculating && start_conv) || (calculating && !acc_psum)) next_calculating = '1;
+        else next_calculating = '0;
+
+        //============= Next Counter =================
+        // The counter is held at 0 while idle. It is 2 bits wide, so the
+        // increment from 3 wraps back to 0.
+        next_counter = calculating ? counter + 1 : '0;
+
+        //============= Multiplication ===============
+        // When counter == 3 the index is outside the [0:2] scratchpad range;
+        // the product of that cycle is not selected because adder_input
+        // takes psum_i when acc_psum is high.
+        mult_out_raw = filter_spad[counter] * ifmap_spad[counter];
+        mult_out_trunc = mult_out_raw[15:6]; //truncate to 10 bits
+
+        //============= Accumulation ================
+        adder_input = acc_psum ? psum_i : mult_out_trunc;
+        adder_output = adder_input + psum_spad;
+        // The running sum is cleared whenever the PE is idle or in the
+        // acc_psum step, so each computation starts from zero.
+        psum_spad_input = (calculating && !acc_psum) ? adder_output : '0;
+
+        //============= Set Output =================
+        psum_o = psum_buffer;
+    end
+
+    // Sequential state: asynchronous active-low reset, otherwise update on
+    // the rising clock edge.
+    always_ff @(posedge clk_i, negedge rstn_i) begin
+        if (!rstn_i) begin
+            //============ set all the registers to 0 =========
+            counter <= '0;
+            for (int i = 0; i < 3; i++) begin
+                filter_spad[i] <= '0;
+                ifmap_spad[i] <= '0;
+            end
+            psum_spad <= '0;
+            psum_buffer <= '0;
+            calculating <= '0;
+            psum_valid_o <= '0;
+
+        end else begin
+            //==========   update state ===========
+            calculating <= next_calculating;
+
+            //==========  update counter  =============
+            counter <= next_counter;
+
+            //==========  update filter scratchpad  =============
+            // Shift towards index 0; the new value enters at index 2.
+            if (read_new_filter_val) begin
+                for (int i = 0; i < 2; i++) begin
+                    filter_spad[i] <= filter_spad[i+1];
+                end
+                filter_spad[2] <= filter_i;
+            end
+
+            //==========  update ifmap scratchpad  =============
+            // Same shift direction as the filter scratchpad.
+            if (read_new_ifmap_val) begin
+                for (int i = 0; i < 2; i++) begin
+                    ifmap_spad[i] <= ifmap_spad[i+1];
+                end
+                ifmap_spad[2] <= ifmap_i;
+            end
+
+            //========= update psum buffer ==========
+            // In the acc_psum step adder_output is psum_i + psum_spad.
+            if (acc_psum) psum_buffer <= adder_output;
+
+            //========= update psum scratchpad ======
+            psum_spad <= psum_spad_input;
+
+            //============= valid bit ===================
+            psum_valid_o <= acc_psum;
+        end
+    end
+
+endmodule
diff --git a/VLSI24/submitted_notebooks/SJSystolicArray/src/python/canny.py b/VLSI24/submitted_notebooks/SJSystolicArray/src/python/canny.py
@@ -0,0 +1,220 @@
+'''
+Reference:
+https://towardsdatascience.com/implement-canny-edge-detection-from-scratch-with-pytorch-a1cccfa58bed
+'''
+import numpy as np
+import torch
+import torch.nn as nn
+import torchvision.transforms as transforms 
+import cv2
+
+def get_gaussian_kernel(k=3, mu=0, sigma=1, normalize=True):
+    # compute 1 dimension gaussian
+    gaussian_1D = np.linspace(-1, 1, k)
+    # compute a grid distance from center
+    x, y = np.meshgrid(gaussian_1D, gaussian_1D)
+    distance = (x ** 2 + y ** 2) ** 0.5
+
+    # compute the 2 dimension gaussian
+    gaussian_2D = np.exp(-(distance - mu) ** 2 / (2 * sigma ** 2))
+    gaussian_2D = gaussian_2D / (2 * np.pi *sigma **2)
+
+    # normalize part (mathematically)
+    if normalize:
+        gaussian_2D = gaussian_2D / np.sum(gaussian_2D)
+    return gaussian_2D
+
+def get_sobel_kernel(k=3):
+    # get range
+    range = np.linspace(-(k // 2), k // 2, k)
+    # compute a grid the numerator and the axis-distances
+    x, y = np.meshgrid(range, range)
+    sobel_2D_numerator = x
+    sobel_2D_denominator = (x ** 2 + y ** 2)
+    sobel_2D_denominator[:, k // 2] = 1  # avoid division by zero
+    sobel_2D = sobel_2D_numerator / sobel_2D_denominator
+    return sobel_2D
+
+
+def get_thin_kernels(start=0, end=360, step=45):
+        """Build the 3x3 directional kernels used for edge thinning.
+
+        A 5x5 base kernel (+1 at the centre, -1 on the cells to the right of
+        the centre in the centre row) is rotated by every angle in
+        ``range(start, end, step)``, cropped to its central 3x3 and cleaned so
+        that only entries with absolute value 1 remain.
+
+        Args:
+            start: first rotation angle passed to ``range``.
+            end: exclusive upper bound of the rotation angles.
+            step: increment between consecutive angles.
+
+        Returns:
+            A list with one 3x3 numpy array per angle.
+        """
+        k_thin = 3  # actual size of the directional kernel
+        # increase for a while to avoid interpolation when rotating
+        k_increased = k_thin + 2
+
+        # get 0° angle directional kernel
+        thin_kernel_0 = np.zeros((k_increased, k_increased))
+        thin_kernel_0[k_increased // 2, k_increased // 2] = 1
+        thin_kernel_0[k_increased // 2, k_increased // 2 + 1:] = -1
+
+        # rotate the 0° angle directional kernel to get the other ones
+        thin_kernels = []
+        for angle in range(start, end, step):
+            (h, w) = thin_kernel_0.shape
+            # get the center to not rotate around the (0, 0) coord point
+            center = (w // 2, h // 2)
+            # apply rotation
+            rotation_matrix = cv2.getRotationMatrix2D(center, angle, 1)
+            # NOTE(review): per the OpenCV docs the 4th positional parameter of
+            # cv2.warpAffine is `dst`, not `flags`, so INTER_NEAREST may not be
+            # applied as the interpolation mode here. The clean-up below looks
+            # like it compensates for interpolated values; confirm before
+            # changing this call.
+            kernel_angle_increased = cv2.warpAffine(thin_kernel_0, rotation_matrix, (w, h), cv2.INTER_NEAREST)
+
+            # get the k=3 kernel
+            kernel_angle = kernel_angle_increased[1:-1, 1:-1]
+            is_diag = (abs(kernel_angle) == 1)      # because of the interpolation
+            kernel_angle = kernel_angle * is_diag   # because of the interpolation
+            thin_kernels.append(kernel_angle)
+        return thin_kernels
+
+
+def write_to_pt_file(data, filename, print_data=False):
+    torch.save(data, filename)
+    if print_data:
+        print(data) 
+
+
+class CannyFilter(nn.Module):
+    """Canny-style edge detector built from fixed-weight ``nn.Conv2d`` layers.
+
+    The module holds Sobel filters for the x and y gradients, eight
+    directional filters used for non-maximum suppression, and a 3x3
+    hysteresis filter. The Sobel step can be replaced by externally computed
+    gradients (see ``forward``).
+    """
+
+    def __init__(self,
+                 k_gaussian=3,
+                 mu=0,
+                 sigma=1,
+                 k_sobel=3,
+                 use_cuda=False):
+        """Create the convolution layers and load their fixed weights.
+
+        Args:
+            k_gaussian: accepted but not used in this constructor.
+            mu: accepted but not used in this constructor.
+            sigma: accepted but not used in this constructor.
+            k_sobel: size of the Sobel kernels.
+            use_cuda: selects 'cuda' or 'cpu' for ``self.device``.
+        """
+        super(CannyFilter, self).__init__()
+        # device
+        # Only used for the zero tensors allocated in forward(); the conv
+        # layers are not moved to this device here.
+        self.device = 'cuda' if use_cuda else 'cpu'
+
+        # sobel
+        sobel_2D = get_sobel_kernel(k_sobel)
+        self.sobel_filter_x = nn.Conv2d(in_channels=1,
+                                        out_channels=1,
+                                        kernel_size=k_sobel,
+                                        padding=k_sobel // 2,
+                                        bias=False)
+        self.sobel_filter_y = nn.Conv2d(in_channels=1,
+                                        out_channels=1,
+                                        kernel_size=k_sobel,
+                                        padding=k_sobel // 2,
+                                        bias=False)
+        # the y filter uses the transpose of the x kernel
+        with torch.no_grad(): 
+            self.sobel_filter_x.weight.copy_(
+            torch.from_numpy(sobel_2D).unsqueeze(0).unsqueeze(0).float())
+        with torch.no_grad(): 
+            self.sobel_filter_y.weight.copy_(
+            torch.from_numpy(sobel_2D.T).unsqueeze(0).unsqueeze(0).float())
+
+
+        # thin
+        # one output channel per directional kernel (8 with the default angles)
+        thin_kernels = get_thin_kernels()
+        directional_kernels = np.stack(thin_kernels)
+        self.directional_filter = nn.Conv2d(in_channels=1,
+                                            out_channels=8,
+                                            kernel_size=thin_kernels[0].shape,
+                                            padding=thin_kernels[0].shape[-1] // 2,
+                                            bias=False)
+        with torch.no_grad(): 
+            self.directional_filter.weight.copy_(
+            torch.from_numpy(directional_kernels).unsqueeze(1).float())
+
+        # hysteresis
+        # 3x3 kernel in which every weight is 1.25
+        hysteresis = np.ones((3, 3)) + 0.25
+        self.hysteresis = nn.Conv2d(in_channels=1,
+                                    out_channels=1,
+                                    kernel_size=3,
+                                    padding=1,
+                                    bias=False)
+        with torch.no_grad(): 
+            self.hysteresis.weight.copy_(
+            torch.from_numpy(hysteresis).unsqueeze(0).unsqueeze(0).float())
+
+
+    def forward(self, img, low_threshold=None, high_threshold=None, hysteresis=False, 
+                use_sa=False, grad_x_sa=0, grad_y_sa=0):
+        """Run gradient computation, edge thinning and optional thresholding.
+
+        Args:
+            img: 4-D tensor whose shape is unpacked as (B, C, H, W).
+            low_threshold: if not None, thin edges are thresholded against it.
+            high_threshold: second threshold; only considered when
+                ``low_threshold`` is not None.
+            hysteresis: when true (and both thresholds are given), weak
+                pixels are promoted using the hysteresis filter.
+            use_sa: when true, skip the Sobel convolutions and use
+                ``grad_x_sa`` / ``grad_y_sa`` as the gradients.
+            grad_x_sa: externally computed x gradient (presumably a tensor
+                shaped like the Sobel output; verify against the caller).
+            grad_y_sa: externally computed y gradient (same assumption).
+
+        Returns:
+            Tuple ``(grad_x, grad_y, grad_magnitude, grad_orientation,
+            thin_edges)``.
+
+        Side effects:
+            When ``use_sa`` is false, the per-channel inputs, the Sobel
+            results and the Sobel weights are saved as ``.pt`` files using
+            relative file names.
+        """
+        # set the steps tensors
+        B, C, H, W = img.shape
+        grad_x = torch.zeros((B, 1, H, W)).to(self.device)
+        grad_y = torch.zeros((B, 1, H, W)).to(self.device)
+        grad_magnitude = torch.zeros((B, 1, H, W)).to(self.device)
+        grad_orientation = torch.zeros((B, 1, H, W)).to(self.device)
+
+        # sobel
+        if use_sa: # calculate the grads with Systolic Array 
+            grad_x = grad_x_sa
+            grad_y = grad_y_sa
+        else: # calculate the grads with Python
+            # accumulate the per-channel Sobel responses and dump the
+            # intermediate tensors to disk
+            for c in range(C):
+                soble_result_x = self.sobel_filter_x(img[:, c:c+1])
+                soble_result_y = self.sobel_filter_y(img[:, c:c+1])
+                grad_x = grad_x + soble_result_x
+                grad_y = grad_y + soble_result_y
+                write_to_pt_file(img[:, c:c+1], f'img_{c}.pt')
+                write_to_pt_file(soble_result_x, f'soble_result_x_{c}.pt')
+                write_to_pt_file(soble_result_y, f'soble_result_y_{c}.pt')
+            write_to_pt_file(self.sobel_filter_x.weight, f'soble_filter_x_weight.pt')
+            write_to_pt_file(self.sobel_filter_y.weight, f'soble_filter_y_weight.pt')
+
+        # thick edges
+        # the division by C is applied in both branches, including use_sa
+        grad_x, grad_y = grad_x / C, grad_y / C
+        grad_magnitude = (grad_x ** 2 + grad_y ** 2) ** 0.5
+        # NOTE(review): pixels where grad_x == 0 divide by zero here - confirm
+        # the resulting inf/nan orientations are acceptable downstream.
+        grad_orientation = torch.atan(grad_y / grad_x)
+        # NOTE(review): the scale factor is 360/pi rather than the usual
+        # 180/pi radian-to-degree factor - confirm this is intended.
+        grad_orientation = grad_orientation * (360 / np.pi) + 180 # convert to degree
+        grad_orientation = torch.round(grad_orientation / 45) * 45  # keep a split by 45
+
+        # thin edges
+        directional = self.directional_filter(grad_magnitude)
+        # get indices of positive and negative directions
+        positive_idx = (grad_orientation / 45) % 8
+        negative_idx = ((grad_orientation / 45) + 4) % 8  # not read again in this method
+        thin_edges = grad_magnitude.clone()
+        # non maximum suppression direction by direction
+        for pos_i in range(4):
+            neg_i = pos_i + 4
+            # get the oriented grad for the angle
+            is_oriented_i = (positive_idx == pos_i) * 1
+            is_oriented_i = is_oriented_i + (positive_idx == neg_i) * 1
+            pos_directional = directional[:, pos_i]
+            neg_directional = directional[:, neg_i]
+            selected_direction = torch.stack([pos_directional, neg_directional])
+            # get the local maximum pixels for the angle
+            # a pixel counts as a maximum only if both directional responses are positive
+            is_max = selected_direction.min(dim=0)[0] > 0.0
+            is_max = torch.unsqueeze(is_max, dim=1)
+            # apply non maximum suppression
+            to_remove = (is_max == 0) * 1 * (is_oriented_i) > 0
+            thin_edges[to_remove] = 0.0
+
+        # thresholds
+        if low_threshold is not None:
+            low = thin_edges > low_threshold
+            if high_threshold is not None:
+                high = thin_edges > high_threshold
+                # get black/gray/white only
+                # 0 = below low, 0.5 = above low only, 1 = above both thresholds
+                thin_edges = low * 0.5 + high * 0.5
+                if hysteresis:
+                    # get weaks and check if they are high or not
+                    weak = (thin_edges == 0.5) * 1
+                    weak_is_high = (self.hysteresis(thin_edges) > 1) * weak
+                    thin_edges = high * 1 + weak_is_high * 1
+            else:
+                thin_edges = low * 1
+        return grad_x, grad_y, grad_magnitude, grad_orientation, thin_edges
+
+
+def main():
+    # Load the input image 
+    image = cv2.imread('rubiks_cube.jpg') 
+    image = cv2.resize(image, (256, 256))  # original 256*256
+
+    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+
+
+    # Convert the image to Torch tensor 
+    img_tensor = torch.from_numpy(image)  #  transform(image)
+    img_tensor = img_tensor.permute(2, 0, 1).unsqueeze(0)
+
+    # Run inference
+    model = CannyFilter()
+    grad_x, grad_y, grad_magnitude, grad_orientation, thin_edges = model(img_tensor.float())
+
+    # Save image results
+    cv2.imwrite('edge_rubiks_cube.jpg', grad_magnitude[0].permute(1, 2, 0).detach().numpy())
+
+
+if __name__ == '__main__':
+    main()
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		# SiliconJackets Systolic Array
		This notebook walks through the design specification, simulation, and implementation of a systolic array using open-source tools and PDKs. The parallel computation and data-reuse capabilities of a systolic array are crucial for accelerating neural networks, and this notebook, together with its reusable design, aims to contribute to the open-source hardware community and enable more efficient ML applications. This project explains the principles behind how a systolic array performs 2D convolution, demonstrates the performance of our implementation with image results, and shows the final GDS generated with an open-source flow. Additionally, to further demonstrate the feasibility of the open-source flow and our design, we are also submitting this systolic array design to the open-source silicon initiative [Tiny Tapeout](https://tinytapeout.com/). This submission was completed by members of SiliconJackets, a student-run organization at Georgia Tech that introduces students to semiconductor design, verification, and implementation through a large collaborative project. We hope to use this notebook as an example for future members of the club.