Fully parallel pipeline shift-add multiplier

Fully parallel pipeline shift-add multiplier

Basic algorithm

Similar to the shift addition of time division multiplexing, time division multiplexing is cancelled, area is used for time, and pipeline design is used. After the pipeline is filled, a result can be calculated in one clock cycle

  • Calculate the shift result of the multiplier separately, and phase and phase with the multiplicand
  • Add the results using an addition tree

RTL code

Shifted part

The code of the fixed shift unit is as follows. When the nth bit of the multiplicand is 1, the result of shifting the multiplier to the left by n bits is output.

module shift_unit #(
    parameter WIDTH = 4,
    parameter SHIFT_NUM = 0
)(
    input clk,//Clock
    input rst_n,//Asynchronous reset active low
    input shift_valid,
    input shift_mask,
    input [WIDTH-1:0]shift_din,

    output reg [2 * WIDTH-1:0]shift_dout
);

wire [2 * WIDTH-1:0]shift_din_ext;
assign shift_din_ext = {(WIDTH)'(0),shift_din};

always @ (posedge clk or negedge rst_n) begin
    if(~rst_n) begin
        shift_dout <='b0;
    end else if((shift_valid == 1'b1) && (shift_mask == 1'b1)) begin
        shift_dout <= shift_din_ext << SHIFT_NUM;
    end else begin
        shift_dout <='b0;
    end
end

endmodule

The shifter code is as follows, using the generation statement to generate bit-wide shifters

module parallel_shifter #(
    parameter WIDTH = 4
)(
    input clk,//Clock
    input rst_n,//Asynchronous reset active low

    input mult_valid,
    input [WIDTH-1:0]mult1,mult2,

    output [(WIDTH ** 2) * 2-1:0]shift_dout
);

genvar a;
generate
    for (a = 0; a <WIDTH; a = a + 1) begin:shifter_layer
        shift_unit #(
            .WIDTH(WIDTH),
            .SHIFT_NUM(a)
        ) u_shift_unit (
            .clk(clk),//Clock
            .rst_n(rst_n),//Asynchronous reset active low
            .shift_valid(mult_valid),
            .shift_mask(mult2[a]),
            .shift_din(mult1),

            .shift_dout(shift_dout[a * 2 * WIDTH +: 2 * WIDTH])
        );
    end
endgenerate

endmodule

Addition part

The addition part uses the addition tree, which can realize the pipeline operation. The following is the single-layer code for the addition number

module adder_layer #(
    parameter ADDER_NUM = 4,
    parameter ADDER_WIDTH = 8
)(
    input clk,//Clock
    input rst_n,//Asynchronous reset active low
    input [ADDER_NUM * ADDER_WIDTH * 2-1:0]adder_din,

    output [ADDER_NUM * (ADDER_WIDTH + 1)-1:0]adder_dout
);

genvar i;
generate
    for(i = 0;i <ADDER_NUM;i = i + 1) begin:adder_layer_gen
        wire [ADDER_WIDTH-1:0]add1 = adder_din[2 * i * ADDER_WIDTH +: ADDER_WIDTH];
        wire [ADDER_WIDTH-1:0]add2 = adder_din[(2 * i + 1) * ADDER_WIDTH +: ADDER_WIDTH];
        wire [ADDER_WIDTH:0]sum = add1 + add2;
        reg [ADDER_WIDTH:0]sum_reg;
        always @ (posedge clk or negedge rst_n) begin
            if(~rst_n) begin
                sum_reg <='b0;
            end else begin
                sum_reg <= sum;
            end
        end
        assign adder_dout[i * (ADDER_WIDTH + 1) +: ADDER_WIDTH + 1] = sum_reg;
    end
endgenerate

endmodule

The following is the addition tree code

module adder_tree #(
    parameter LAYER_NUM = 4,
    parameter MIN_ADDER_WIDTH = 8
)(
    input clk,//Clock
    input rst_n,//Asynchronous reset active low

    input [(2 ** LAYER_NUM) * MIN_ADDER_WIDTH-1:0]adder_din,
    output [LAYER_NUM + MIN_ADDER_WIDTH-1:0]adder_dout
);

genvar i;
generate
    for(i = LAYER_NUM;i> 0;i = i-1)begin:adder_layer_def
        wire [(2 ** i) * (MIN_ADDER_WIDTH + LAYER_NUM-i)-1:0]layer_din;
        wire [2 ** (i-1) * (MIN_ADDER_WIDTH + LAYER_NUM-i + 1)-1:0]layer_dout;
        if(i == LAYER_NUM) begin
            assign layer_din = adder_din;
        end else begin
            assign layer_din = adder_layer_def[i + 1].layer_dout;
        end
        adder_layer # (
            .ADDER_NUM(2 ** (i-1)),
            .ADDER_WIDTH(MIN_ADDER_WIDTH + LAYER_NUM-i)
        ) u_adder_layer (
            .clk(clk),//Clock
            .rst_n(rst_n),//Asynchronous reset active low
            .adder_din(layer_din),
            .adder_dout(layer_dout)
        );
    end
endgenerate

assign adder_dout = adder_layer_def[1].layer_dout;
endmodule

Top level

The top layer combines an adder and a shifter, the code is as follows

module shift_adder #(
    parameter LOG2_WIDTH = 2
)(
    input clk,//Clock
    input rst_n,//Asynchronous reset active low

    input [2 ** LOG2_WIDTH-1:0]mult1,mult2,
    input din_valid,

    output [(2 ** LOG2_WIDTH) * 2-1:0]dout
);

parameter WIDTH = 2 ** LOG2_WIDTH;

wire [(WIDTH ** 2) * 2-1:0]shift_dout;
parallel_shifter #(
    .WIDTH(WIDTH)
) u_parallel_shifter (
    .clk(clk),//Clock
    .rst_n(rst_n),//Asynchronous reset active low

    .mult_valid(din_valid),
    .mult1(mult1),
    .mult2(mult2),

    .shift_dout(shift_dout)
);

wire [LOG2_WIDTH + 2 * WIDTH:0]adder_dout;
adder_tree #(
    .LAYER_NUM(LOG2_WIDTH),
    .MIN_ADDER_WIDTH(2 * WIDTH)
) u_adder_tree (
    .clk(clk),//Clock
    .rst_n(rst_n),//Asynchronous reset active low

    .adder_din(shift_dout),
    .adder_dout(adder_dout)
);
assign dout = adder_dout[WIDTH * 2-1:0];

endmodule

test

The test platform is completed using sv syntax. Because the multiplier takes a fixed time to complete an operation, there is no effective signal output. After finding the fixed delay, compare it with the *calculated result.

module mult_tb (
);

parameter LOG2_WIDTH = 2;
parameter WIDTH = 2 ** LOG2_WIDTH;

logic clk,rst_n;
logic multiplier_valid;
logic [WIDTH-1:0]multiplier1;
logic [WIDTH-1:0]multiplier2;

logic [2 * WIDTH-1:0]product;

shift_adder #(
    .LOG2_WIDTH(LOG2_WIDTH)
) dut (
    .clk(clk),//Clock
    .rst_n(rst_n),//Asynchronous reset active low

    .mult1(multiplier1),
    .mult2(multiplier2),
    .din_valid(multiplier_valid),

    .dout(product)
);

initial begin
    clk = 1'b0;
    forever begin
        #50 clk = ~clk;
    end
end

initial begin
    rst_n = 1'b1;
    #5 rst_n = 1'b0;
    #10 rst_n = 1'b1;
end

initial begin
    {multiplier_valid,multiplier1,multiplier2} ='b0;
    repeat(100) begin
        @(negedge clk);
        multiplier1 = (WIDTH)'($urandom_range(0,2 ** WIDTH));
        multiplier2 = (WIDTH)'($urandom_range(0,2 ** WIDTH));
        multiplier_valid = 1'b1;
    end
    $stop();
end

reg [WIDTH-1:0]mult11,mult12,mult13;
reg [WIDTH-1:0]mult21,mult22,mult23;
reg [2 * WIDTH-1:0]exp;

always @ (posedge clk or negedge rst_n) begin
    if(~rst_n) begin
        {mult11,mult12,mult13,mult21,mult22,mult23} <='b0;
    end else begin
        mult13 <= mult12;
        mult12 <= mult11;
        mult11 <= multiplier1;

        mult23 <= mult22;
        mult22 <= mult21;
        mult21 <= multiplier2;
    end
end

initial begin
    exp ='b0;
    forever begin
        @(negedge clk);
        exp = mult13 * mult23;
        if(exp == product) begin
            $display("successful");
        end else begin
            $display("fail");
        end
    end
end
endmodule
Reference: https://cloud.tencent.com/developer/article/1110619 Fully Parallel Pipeline Shift Add Multiplier-Cloud + Community-Tencent Cloud