diff --git a/devlog/2025-05-21-Rethink-routing.md b/devlog/2025-05-21-Rethink-routing.md new file mode 100644 index 0000000..1bd5eb8 --- /dev/null +++ b/devlog/2025-05-21-Rethink-routing.md @@ -0,0 +1,91 @@ +# Rethinking the Routing Memory Pool +Date: 2025-05-21 + +## Goals and Expectations +To finish the RX and TX queues. + +## Results +Nope. I'm halfway through the TX queue and I'm gonna rework the +entire thing. + +## Thought Train +Separating the TX queue to be per-interface is amazing. But making it +a multi-headed queue is a disaster. In this case, it doesn't simplify +the logic, while taking away one of the benefits of a shared memory +pool. + +Allow me to walk through this: If we had a symmetric design, where +all interfaces send and receive at the same speed, synced, then this +would not have been a problem. But in the real world, the interfaces +won't guarantee that. Which means for the multi-headed queue, I'd +have to implement a separate queue tracking which packets are +complete - one of the reasons why I chose to separate the queues in +the first place. It meant tracking, per interface, which packets are +complete, which is as complex as a shared memory pool. And in the +shared memory pool case, it would've handled bursts better. So, why +not just implement the shared memory pool and let each interface keep +track of the complete packets, and let the central routing logic +handle a bi-directional multi-headed queue where each interface gets a +read and write pointer? + +## Reworking details + +### Rework the central logic +The `hub` would now keep track of the packet queues: + +1. When there's an incoming byte from an interface, throw it in the + appropriate place. If it's a new packet, get a new spot for the + packet or drop it; if it's part of an existing packet (in memory), + append it to that. Benefit: no more buffering packets inside of + the interfaces. +2. When a packet is complete (i.e. 
reached the pre-agreed length), + parse its header and figure out where to send it, and send a + message to the interface telling it where the packet is. No more + buffering for the header in the interfaces to tell the hub where to + send the packet. + +In addition to that, let the command `000000` always be a command to +the hub, with the packet length, and the rest of the packet is all +just info for the hub (who knows why the hub needs at least 63 bytes +of data for a command). This means that there's no need for an +`rx_cmd` section; just let the hub store the entire packet and parse +it later. + +#### IMPORTANT NOTE +There may be a need for reserved memory for to-the-hub commands; +otherwise, when the packet queue is full, the hub would drop the +packet. + +#### More notes +The hub will now contain *ALL* the logic for congestion control. If +it's full, toggle a bit to let the interfaces know and start sending +out messages. + +### Rework the interfaces +The interfaces will contain less logic. They only know the following +things: + +1. Upon receiving a byte, if the hub has space, send it to the hub; + otherwise send back a message telling the device to stop congesting + the fabric. +2. When the hub tells it that another packet for that interface is + ready, start sending it if it's not sending anything else, or add + it to the to-send queue. **NOTE:** Congestion messages are + top-priority; they're always the first packets to check for. + +## Potential problem +If one flow is congesting the fabric, then the entire network would be +congested. However, there are CC methods and we can always have an +upper bound for the TX to-send queues. + +## Reflections +Good planning is still the way. Plan as you go. See the trade-offs. +Also, try; trying would make the plan and the project better. + +It's good that I was able to catch this before I implemented the entire +thing. 
And what I already completed isn't in vain - the logic is +still there, what I learned from doing it is still there, they've just +been repurposed to something else, something more elegant. + +## Next steps +Put the reworking into action. diff --git a/fabric/src/hub.sv b/fabric/src/hub.sv index b5e09b9..d8585fb 100644 --- a/fabric/src/hub.sv +++ b/fabric/src/hub.sv @@ -5,7 +5,7 @@ module hub ( input logic [3:0] rx_cmd_valid, input logic [31:0] rx_byte, input logic [3:0] rx_valid, - input logic [7:0] rx2tx_dest, // rx byte's destination + input logic [31:0] rx2tx_dest, // rx byte's destination input logic [3:0] tx_ready, // if tx_byte is ready to be read output logic [3:0] rx_ready, // if rx_byte is ready to be read output logic [7:0] tx_src, // tell the tx where the stream is comming from @@ -49,7 +49,7 @@ module hub ( for (int i = 0; i < 4; i++) begin if (rx_valid[i]) begin if (!in_buffer[i]) begin - service_buffer[i].dest <= get_dest(rx2tx_dest, i[1:0]); + service_buffer[i].dest <= get_hop(rx2tx_dest, i[1:0]); service_buffer[i].payload <= get_byte(rx_byte, i[1:0]); in_buffer[i] <= 1; end @@ -73,12 +73,24 @@ module hub ( endmodule // hub -function automatic logic [1:0] get_dest(input logic [7:0] dest_map, - input logic [1:0] idx); - return dest_map[{idx, 1'b0} +: 2]; -endfunction // get_dest - function automatic logic [7:0] get_byte(input logic [31:0] byte_arr, input logic [1:0] idx); return byte_arr[{idx, 3'b000} +: 8]; endfunction // get_byte + +// NOTE: addr 0 is alway mapped to the fabric itself and caught before this +function automatic logic [1:0] get_hop(input logic [31:0] dest_map, + input logic [1:0] idx); + case (dest_map[{idx, 3'b000} +: 8]) + 8'b00000001: + return 2'b00; + 8'b00000010: + return 2'b01; + 8'b00000011: + return 2'b10; + 8'b00000100: + return 2'b11; + default: + return 0; + endcase // case (dest_map[{idx, 3'b000} +: 8]) +endfunction // get_hop diff --git a/fabric/src/interface.sv b/fabric/src/interface.sv index 4753c09..a93e18a 100644 --- 
a/fabric/src/interface.sv +++ b/fabric/src/interface.sv @@ -10,11 +10,12 @@ module spi_interface( input logic tx_valid, input logic [7:0] tx_byte, input logic [1:0] tx_src, + input logic [1:0] packet_size, output logic miso, output logic tx_ready, output logic rx_valid, output logic [7:0] rx_byte, - output logic [1:0] rx_dest, + output logic [7:0] rx_dest, output logic [7:0] rx_cmd, output logic rx_cmd_valid); @@ -31,23 +32,23 @@ module spi_interface( .clk_rising_edge(sclk_rising_edge), .clk_falling_edge(sclk_falling_edge)); - int bit_cnt = 0; + int bit_cnt = 0; logic [7:0] rx_shift; logic [7:0] tx_shift = 8'b00101010; - logic [7:0] tx_buff = '0; + logic [7:0] rx_buff = '0; logic byte_ready = 0; always_ff @ (posedge sclk_rising_edge or posedge rst) begin if (rst) begin rx_shift <= '0; - tx_buff <= '0; + rx_buff <= '0; bit_cnt <= '0; byte_ready <= 0; end else begin if (cs) begin rx_shift <= 0; - tx_buff <= 0; + rx_buff <= 0; bit_cnt <= 0; end else begin rx_shift <= {rx_shift[6:0], mosi}; @@ -55,7 +56,7 @@ module spi_interface( if (bit_cnt == 7) begin bit_cnt <= 0; - tx_buff <= {rx_shift[6:0], mosi}; + rx_buff <= {rx_shift[6:0], mosi}; byte_ready <= 1; end else byte_ready <= 0; @@ -64,7 +65,7 @@ module spi_interface( $display("[%0d] current rx_shift: %b", $time, rx_shift); $display("[%0d] current bit_cnt: %0d", $time, bit_cnt); - $display("[%0d] current tx_buff: %b", $time, tx_buff); + $display("[%0d] current rx_buff: %b", $time, rx_buff); end // always_ff @ (posedge sclk) always_ff @ (posedge sclk_falling_edge) begin @@ -76,7 +77,7 @@ module spi_interface( tx_shift <= 0; end else begin if (bit_cnt == 0) begin - tx_shift <= tx_buff[7:0]; + tx_shift <= rx_buff[7:0]; end else begin tx_shift <= {tx_shift[6:0], 1'b0}; end @@ -89,25 +90,25 @@ module spi_interface( assign miso = tx_shift[7]; - shortint packet_size = 64; - // RX and TX logic - logic [9:0] rx_queue_head = 0; - logic [9:0] rx_queue_tail = 0; + logic [9:0] rx_queue_head = 0; + logic [9:0] rx_queue_tail = 0; 
logic [10:0] rx_size = 0; logic rx_queue_write = 0; logic [7:0] rx_read; - logic packet_in; + logic [7:0] dest_read; + logic packet_sending; logic rx_queue_empty; assign rx_size = (rx_queue_tail + 11'd1024 - rx_queue_head) & 11'h3FF; assign rx_queue_empty = ~(|rx_size); - bram_1024B rx_queue (.sys_clk(sys_clk), + rx_queue_bram rx_queue (.sys_clk(sys_clk), .write_enable(rx_queue_write), .read_addr(rx_queue_head), .write_addr(rx_queue_tail), - .write_data(tx_buff), - .read_data(rx_read)); + .write_data(rx_buff), + .read_data(rx_read), + .read_dest(dest_read)); always_ff @ (posedge sys_clk) begin if (rst) begin @@ -115,17 +116,36 @@ module spi_interface( rx_queue_tail <= '0; rx_queue_write <= '0; rx_read <= '0; - packet_in <= 0; + packet_sending <= 0; end else begin if (byte_ready) rx_queue_write <= 1; - else + if (rx_queue_write) begin rx_queue_write <= 0; - if (!packet_in && rx_size > 2) begin - // CONSULT internal routing table for directions + rx_queue_tail <= rx_queue_tail + 1; + end + if (!packet_sending) begin + if (rx_size > 2 && rx_ready) begin + rx_byte <= rx_read; + rx_dest <= dest_read; + rx_valid <= 1; + end else + rx_valid <= 0; + end else begin + if (is_packet_complete(rx_queue_head, packet_size)) + packet_sending <= 0; + else if (rx_size > 0) begin + rx_byte <= rx_read; + rx_dest <= dest_read; + rx_valid <= 1; + end end end - end + end // always_ff @ (posedge sys_clk) + + logic [13:0] tx_queue_head; + logic [13:0] tx_queue_tail; + endmodule // spi_interface @@ -166,23 +186,58 @@ module async_get_clk_edges( `endif // !`ifdef SYNC_2FF endmodule // async_get_clk_edges -module bram_1024B ( +module rx_queue_bram ( input logic sys_clk, input logic write_enable, input logic [9:0] read_addr, input logic [9:0] write_addr, input logic [7:0] write_data, - output logic [7:0] read_data); + output logic [7:0] read_data, + output logic [7:0] read_dest); timeunit 1ns; timeprecision 1ps; - logic [7:0] mem [0:1023]; + logic [7:0] mem [1023:0]; - always_ff @(posedge 
sys_clk) begin + always_ff @ (posedge sys_clk) begin + if (write_enable) + mem[write_addr] <= write_data; + read_data <= mem[read_addr]; + read_dest <= mem[read_addr + 1]; + end + +endmodule // rx_queue_bram + +module tx_queue_bram(input logic sys_clk, + input logic write_enable, + input logic [13:0] read_addr, + input logic [13:0] write_addr, + input logic [7:0] write_data, + output logic [7:0] read_data); + timeunit 1ns; + timeprecision 1ps; + + logic [7:0] mem [16 * 1023:0]; + + always_ff @ (posedge sys_clk) begin if (write_enable) mem[write_addr] <= write_data; read_data <= mem[read_addr]; end + +endmodule // tx_queue_bram -endmodule // bram_1024B +function automatic logic is_packet_complete(input logic [9:0] head, + input logic [1:0] packet_size); + case(packet_size) + 2'b00: + return &(head & 'd64); + 2'b01: + return &(head & 'd128); + 2'b10: + return &(head & 'd256); + 2'b11: + return &head; + endcase // case (packet_size) +endfunction // packet_complete