From 33aaa43fa0f16b893ede7ed1657cbadbe345dcd4 Mon Sep 17 00:00:00 2001 From: Peisong Xiao Date: Sat, 7 Jun 2025 01:01:00 -0400 Subject: [PATCH] completed fabric logic (untested) --- devlog/2025-06-06-SPI-Ready.md | 86 +++++++++ src/fabric/hub.sv | 31 ++-- src/fabric/interface.sv | 308 ++++++++++++++++++++++++++++----- src/fabric/params.svh | 5 +- 4 files changed, 365 insertions(+), 65 deletions(-) create mode 100644 devlog/2025-06-06-SPI-Ready.md diff --git a/devlog/2025-06-06-SPI-Ready.md b/devlog/2025-06-06-SPI-Ready.md new file mode 100644 index 0000000..60652fc --- /dev/null +++ b/devlog/2025-06-06-SPI-Ready.md @@ -0,0 +1,86 @@ +# Fabric's Basic Logic Done +Date: 2025-06-06 + +## Goals and expectations +Complete the fabric's code. + +## Results +Done. + +And also revised the directory layout (swapped `fabric/` and `src/`). + +As a bonus, I also modularized the interfaces' code, so that it looks +a bit cleaner than it used to. + +And as a second bonus, I completed the drop logic. + +## Packet dropping +Packet dropping can be caused by two issues: + +1. The packet memory is full, in which case the hub will notify all + interfaces to drop their packets. +2. The destination's packet queue is full, in which case the hub will + drop the packet by immediately reclaiming the slot via the reuse + logic. + +There are a few scenarios in which this can play out. Notably, if both +the packet memory and the destination's packet queue are full, and the +destination was full before the packet memory, then the **hub** will +drop the packet instead of the source interface. + +## Reflections +1. Keep the momentum alive. I kept the momentum I gained yesterday + alive by actively coding today; this helped me pick up my previous + thoughts. +2. Small rewards go a long way. I recently showed some people the + ROSE repo; they didn't read much code, but they were obsessed with + the devlogs and the README and how organized the entire project is. 
+ This boosted my confidence in thinking that ROSE could be a + game changer for me - I wanted it to become something that people + can appreciate, even if they're not devs, even if they don't read + the code. I enjoyed their praise, and I'm preparing myself to + uphold that. +3. Simplify the logic. Much of today's code was linking different + modules together and having internal states within the modules to + manage the operations. A lot of times, the data is already there, + and there's no reason to stall it for an additional cycle to buffer + it. +4. HDL is better for designs meant for hardware. This isn't + ROSE-specific, but rather a thought that I came across while + implementing the new ARCANE load balancing algorithm for an `ns-3` + sim. It would've been much cleaner to implement a + hardware-targeting algorithm in HDL than in a language like C++. + For example, the three states of ARCANE (explore, normal and + freeze) would be perfectly implemented by an FSM. While the C++ + sim would have to schedule timeouts using very explicit (and + seemingly unnecessarily complex) code, SystemVerilog would embrace + that by design. + +## Final thoughts +Finally! After more than a month of intermittent coding, I'm close to +completing a viable solution for the fabric. Just some sim tests and +I can begin working on the Linux code for the hosts. + +I love coding, I love writing my thoughts down, in code or in plain +words. It's the proof of my existence. I'm here to leave a mark, for +those who'd like to see how I do it. + +> C'est cela l'amour, tout donner, tout sacrifier sans espoir de +> retour. + +### What is such a thing as love that doesn't hope for anything in return? +It's that the love is for a bigger picture. We'd always hope for +something in return, whether that's tangible or intangible, whether +that's spontaneous or given to us by someone else. But the scale of the +hope is different when we have love. 
With it, we see a bigger +picture, we see how that love is impulsive, how that love drives us +forward, and we see how in the broader view, it makes us better. + +## Next steps +Testing. I'm sure when I throw real traffic at my code, it will go +wrong in ways. + +Notably, I should test the drop logic with different scenarios: +bursts, incast, and heavy background flows. Simulating congestion is +my thing (but, of course, we won't be running something as complex as +an AllReduce workload). diff --git a/src/fabric/hub.sv b/src/fabric/hub.sv index 827f71f..b0ecccb 100644 --- a/src/fabric/hub.sv +++ b/src/fabric/hub.sv @@ -2,8 +2,7 @@ `include // IMPORTANT: interfaces are supposed to keep track of their own packet states -module hub( - input logic rst, +module hub(input logic rst, input logic sys_clk, input logic [7:0] rx_byte [INTERFACE_CNT], input logic rx_valid [INTERFACE_CNT], @@ -16,8 +15,7 @@ module hub( input logic [QUEUE_ADDR_LEN - 1:0] tx_queue_addr [INTERFACE_CNT], output logic [QUEUE_ADDR_LEN - 1:0] tx_new_queue[INTERFACE_CNT], output logic tx_new_queue_valid [INTERFACE_CNT], - input logic tx_new_queue_ready [INTERFACE_CNT], - output logic free_queue_empty); + input logic tx_new_queue_ready [INTERFACE_CNT]); timeunit 1ns; timeprecision 1ps; @@ -30,7 +28,8 @@ module hub( logic [QUEUE_ADDR_LEN - 1:0] new_slot_addr; logic [QUEUE_ADDR_LEN - 1:0] empty_slot_addr; logic empty_slot_enqueue; - + logic free_queue_empty; + free_queue fqueue(.sys_clk(sys_clk), .rst(rst), .request_new_slot(request_new_slot), @@ -54,6 +53,12 @@ module hub( .write_byte(mem_write_byte), .write_enable(mem_write_enable), .read_byte(mem_read_byte)); + + always_comb begin + foreach (rx_ready[i]) begin + assign rx_ready[i] = !free_queue_empty || reuse_queue_slot[i]; + end + end always_ff @ (posedge sys_clk or rst) begin if (rst) begin @@ -75,23 +80,17 @@ module hub( mem_write_byte <= '0; end else begin // NOTE: signaled the servicing interface in the last cycle - rx_ready[curr] <= 0; - 
rx_ready[curr + 1] <= 1; tx_new_queue_valid[dest_buff[curr - 1]] <= 0; - // IMPORTANT: interfaces should send the byte no matter what, rx_ready is to prevent sending a new byte + // IMPORTANT: interfaces should send the byte no matter what, rx_ready is to prevent starting a new packet if (rx_valid[curr]) begin // IMPORTANT: memory_write_addr is ready on the next cycle if (rx_pkt_addr[curr] == 0 && !reuse_queue_slot[curr] && !(|new_slot_cooldown[curr])) begin - if (free_queue_empty) begin - // TODO: handle the drop logic - end else begin - request_new_slot <= 1; - rx_queue_addr[curr] <= new_slot_addr; - mem_write_addr <= {new_slot_addr, rx_pkt_addr[curr]}; - new_slot_cooldown[curr] <= NEW_SLOT_COOLDOWN; - end + request_new_slot <= 1; + rx_queue_addr[curr] <= new_slot_addr; + mem_write_addr <= {new_slot_addr, rx_pkt_addr[curr]}; + new_slot_cooldown[curr] <= NEW_SLOT_COOLDOWN; end else begin // if (rx_new_packet[curr]) reuse_queue_slot[curr] <= 0; mem_write_addr <= {rx_queue_addr[curr], rx_pkt_addr[curr]}; diff --git a/src/fabric/interface.sv b/src/fabric/interface.sv index 77acfb4..d16cfd6 100644 --- a/src/fabric/interface.sv +++ b/src/fabric/interface.sv @@ -1,16 +1,16 @@ // NOTE: The first byte is used for syncing due to using different clock domains `include -module spi_interface( - input logic rst, +module spi_interface(input logic rst, input logic sys_clk, - input logic mosi, - input logic cs, input logic sclk, + input logic cs, + input logic mosi, output logic miso, output logic [7:0] rx_byte, output logic rx_valid, input logic rx_ready, output logic [PACKET_ADDR_LEN - 1:0] rx_pkt_addr, + output logic tx_active, input logic [7:0] tx_byte, input logic tx_valid, output logic tx_ready, @@ -18,8 +18,7 @@ module spi_interface( output logic [QUEUE_ADDR_LEN - 1:0] tx_queue_addr, input logic [QUEUE_ADDR_LEN - 1:0] tx_new_queue, input logic tx_new_queue_valid, - output logic tx_new_queue_ready, - input logic free_queue_empty); + output logic tx_new_queue_ready); 
timeunit 1ns; timeprecision 1ps; @@ -33,19 +32,77 @@ module spi_interface( .sys_clk(sys_clk), .clk_rising_edge(sclk_rising_edge), .clk_falling_edge(sclk_falling_edge)); + logic [7:0] rx_buff; + shortint bit_cnt; + logic rx_byte_ready; + logic [7:0] tx_buff; + logic tx_buff_valid; + logic tx_loaded; + + spi spi_module(.rst(rst), + .sclk_rising_edge(sclk_rising_edge), + .sclk_falling_edge(sclk_falling_edge), + .cs(cs), + .mosi(mosi), + .miso(miso), + .bit_cnt(bit_cnt), + .rx_buff(rx_buff), + .rx_byte_ready(rx_byte_ready), + .tx_buff(tx_buff), + .tx_buff_valid(tx_buff_valid), + .tx_loaded(tx_loaded)); + + rx_hub rx_module(.rst(rst), + .sys_clk(sys_clk), + .rx_buff(rx_buff), + .rx_byte_ready(rx_byte_ready), + .rx_byte(rx_byte), + .rx_valid(rx_valid), + .rx_ready(rx_ready), + .rx_pkt_addr(rx_pkt_addr)); + + assign tx_buff = tx_byte; + + tx_hub tx_module(.rst(rst), + .sys_clk(sys_clk), + .tx_active(tx_active), + .tx_loaded(tx_loaded), + .tx_byte(tx_byte), + .tx_valid(tx_valid), + .tx_ready(tx_ready), + .tx_pkt_addr(tx_pkt_addr), + .tx_queue_addr(tx_queue_addr), + .tx_new_queue(tx_new_queue), + .tx_new_queue_valid(tx_new_queue_valid), + .tx_new_queue_ready(tx_new_queue_ready)); + +endmodule // spi_interface + +module spi(input logic rst, + input logic sclk_rising_edge, + input logic sclk_falling_edge, + input logic cs, + input logic mosi, + output logic miso, + output shortint bit_cnt, + output logic [7:0] rx_buff, + output logic rx_byte_ready, + input logic tx_buff_valid, + input logic [7:0] tx_buff, + output logic tx_loaded); + timeunit 1ns; + timeprecision 1ps; - shortint bit_cnt = 0; logic [7:0] rx_shift; - logic [7:0] tx_shift = 8'b00101010; - logic [7:0] rx_buff = '0; - logic byte_ready = 0; + logic [7:0] tx_shift; + assign miso = tx_shift[7]; always_ff @ (posedge sclk_rising_edge or posedge rst) begin if (rst) begin rx_shift <= '0; rx_buff <= '0; bit_cnt <= '0; - byte_ready <= 0; + rx_byte_ready <= 0; end else begin if (cs) begin @@ -59,53 +116,33 @@ module 
spi_interface( if (bit_cnt == 7) begin bit_cnt <= 0; rx_buff <= {rx_shift[6:0], mosi}; - byte_ready <= 1; + rx_byte_ready <= 1; end else begin - byte_ready <= 0; + rx_byte_ready <= 0; end end // else: !if(cs) end // else: !if(rst) end // always_ff @ (posedge sclk) - shortint idle_cntdn; - logic rx_drained; - - always_ff @ (posedge sys_clk or rst) begin - if (rst) begin - rx_drained <= 0; - rx_pkt_addr <= '1; - rx_byte <= '0; - rx_valid <= 0; - idle_cntdn <= 0; - end else begin - if (!rx_drained && byte_ready) begin - rx_byte <= rx_buff; - rx_valid <= 1; - idle_cntdn <= INTERFACE_IDLE_COUNTDOWN; - rx_drained <= 1; - rx_pkt_addr <= rx_pkt_addr + 1; - end else if (!byte_ready) begin - rx_drained <= 0; - if (!(|idle_cntdn)) begin - rx_valid <= 0; - end else begin - idle_cntdn <= idle_cntdn - 1; - end - end - end - end - always_ff @ (posedge sclk_falling_edge or rst) begin if (rst) begin - tx_shift <= 8'b00101010; + tx_shift <= TX_DEFAULT; + tx_loaded <= 0; end else begin if (cs) begin - tx_shift <= 0; + tx_shift <= TX_DEFAULT; end else begin if (bit_cnt == 0) begin - tx_shift <= rx_buff[7:0]; + if (tx_buff_valid) begin + tx_shift <= tx_buff; + tx_loaded <= 1; + end else begin + tx_shift <= '0; + tx_loaded <= 1; + end end else begin tx_shift <= {tx_shift[6:0], 1'b0}; + tx_loaded <= 0; end end end // else: !if(rst) @@ -114,13 +151,146 @@ module spi_interface( $display("-----------------------------------------"); end // always_ff @ (negedge sclk) - assign miso = tx_shift[7]; +endmodule // spi +module rx_hub(input logic rst, + input logic sys_clk, + input logic [7:0] rx_buff, + input logic rx_byte_ready, + output logic [7:0] rx_byte, + output logic rx_valid, + input logic rx_ready, + output logic [PACKET_ADDR_LEN - 1:0] rx_pkt_addr); + timeunit 1ns; + timeprecision 1ps; -endmodule // spi_interface + shortint rx_load_cooldown; + logic rx_drained; + logic rx_dropping; -module async_get_clk_edges( - input logic rst, + always_ff @ (posedge sys_clk or rst) begin + if (rst) 
begin + rx_drained <= 0; + rx_pkt_addr <= '1; + rx_byte <= '0; + rx_valid <= 0; + rx_load_cooldown <= 0; + rx_dropping <= 0; + end else begin + if (!rx_drained && rx_byte_ready) begin + if (rx_ready && !rx_dropping) begin + rx_byte <= rx_buff; + rx_valid <= 1; + rx_load_cooldown <= RX_LOAD_COOLDOWN; + rx_drained <= 1; + rx_pkt_addr <= rx_pkt_addr + 1; + end else begin + rx_dropping <= 1; + end + if (rx_ready && &rx_pkt_addr) begin + rx_dropping <= 0; + end + end else if (!rx_byte_ready) begin + rx_drained <= 0; + if (rx_load_cooldown == 0) begin + rx_valid <= 0; + end else begin + rx_load_cooldown <= rx_load_cooldown - 1; + end + end + end + end // always_ff @ (posedge sys_clk or rst) +endmodule // rx_hub + +module tx_hub(input logic rst, + input logic sys_clk, + output logic tx_active, + input logic tx_loaded, + input logic [7:0] tx_byte, + input logic tx_valid, + output logic tx_ready, + output logic [PACKET_ADDR_LEN - 1:0] tx_pkt_addr, + output logic [QUEUE_ADDR_LEN - 1:0] tx_queue_addr, + input logic [QUEUE_ADDR_LEN - 1:0] tx_new_queue, + input logic tx_new_queue_valid, + output logic tx_new_queue_ready); + timeunit 1ns; + timeprecision 1ps; + + shortint tx_load_cooldown; + assign tx_new_queue_ready = !pkt_queue_full; + + logic queue_active; + assign tx_active = queue_active || queue_addr_valid; + + always_ff @ (posedge sys_clk or rst) begin + if (rst) begin + tx_ready <= 0; + tx_pkt_addr <= '0; + enqueue_addr <= 0; + new_queue_addr <= '0; + request_new_pkt <= 0; + queue_active <= 0; + tx_pkt_addr <= '0; + tx_load_cooldown <= 0; + end else begin // if (rst) + if (tx_new_queue_valid && tx_new_queue_ready) begin + enqueue_addr <= 1; + new_queue_addr <= tx_new_queue; + end else begin + enqueue_addr <= 0; + end + + if (!queue_active && queue_addr_valid) begin + request_new_pkt <= 1; + queue_active <= 1; + tx_pkt_addr <= '1; + end else begin + request_new_pkt <= 0; + end + + if (queue_active) begin + if (tx_loaded && tx_load_cooldown == 0) begin + tx_ready <= 1; + 
tx_load_cooldown <= TX_LOAD_COOLDOWN; + tx_pkt_addr <= tx_pkt_addr + 1; + if (&tx_pkt_addr) begin + queue_active <= 0; + end + end else begin + tx_ready <= 0; + if (tx_load_cooldown != 0) begin + tx_load_cooldown <= tx_load_cooldown - 1; + end + end + end else begin // if (queue_active) + tx_pkt_addr <= '1; + if (tx_load_cooldown != 0) begin + tx_load_cooldown <= tx_load_cooldown - 1; + end + end + end // else: !if(rst) + end // always_ff @ (posedge sys_clk or rst) + + logic request_new_pkt; + logic [QUEUE_ADDR_LEN - 1:0] new_queue_addr; + logic enqueue_addr; + logic queue_addr_valid; + logic pkt_queue_full; + + // IMPORTANT: tx_queue_addr is directly linked + packet_queue pkt_q (.rst(rst), + .sys_clk(sys_clk), + .request_new_packet(request_new_pkt), + .new_queue_addr(new_queue_addr), + .enqueue_addr(enqueue_addr), + .next_queue_addr(tx_queue_addr), + .next_queue_addr_valid(queue_addr_valid), + .queue_full(pkt_queue_full)); + +endmodule // tx_hub + +module async_get_clk_edges(input logic rst, input logic ext_clk, input logic sys_clk, output logic clk_rising_edge, @@ -144,3 +314,45 @@ module async_get_clk_edges( assign clk_rising_edge = sync_0 & ~sync_1; assign clk_falling_edge = ~sync_0 & sync_1; endmodule // async_get_clk_edges + +// IMPORTANT: new_queue_addr will always be ready when queue_size is greater than 1 +module packet_queue(input logic rst, + input logic sys_clk, + input logic request_new_packet, + input logic [QUEUE_ADDR_LEN - 1:0] new_queue_addr, + input logic enqueue_addr, + output logic [QUEUE_ADDR_LEN - 1:0] next_queue_addr, + output logic next_queue_addr_valid, + output logic queue_full); + timeunit 1ns; + timeprecision 1ps; + + logic [QUEUE_ADDR_LEN - 1:0] mem [INTERFACE_QUEUE_SIZE]; + logic [INTERFACE_QUEUE_ADDR_LEN - 1:0] head; + logic [INTERFACE_QUEUE_ADDR_LEN - 1:0] tail; + shortint queue_size; + assign queue_full = queue_size == INTERFACE_QUEUE_SIZE[31:16]; + assign next_queue_addr_valid = !(queue_size == 0); + + always_ff @ (posedge sys_clk 
or rst) begin + if (rst) begin + head <= '0; + tail <= '0; + queue_size <= 0; + next_queue_addr <= '0; + end else begin + if (request_new_packet) begin + head <= head + 1; + queue_size <= queue_size - 1; + end + next_queue_addr <= mem[head]; + + if (enqueue_addr) begin + mem[tail] <= new_queue_addr; + tail <= tail + 1; + queue_size <= queue_size + 1; + end + end + end + +endmodule // packet_queue diff --git a/src/fabric/params.svh b/src/fabric/params.svh index 23800ca..882765c 100644 --- a/src/fabric/params.svh +++ b/src/fabric/params.svh @@ -7,6 +7,8 @@ parameter int PACKET_ADDR_LEN = 6; parameter int ROSE_ADDR_LEN = 8; parameter logic [PACKET_ADDR_LEN - 1:0] ROSE_DEST_INDEX = 1; +parameter logic [7:0] TX_DEFAULT = 8'b00101010; + parameter shortint QUEUE_SIZE = 1024; parameter int QUEUE_ADDR_LEN = 10; parameter int MEMORY_POOL_SIZE = QUEUE_SIZE * PACKET_SIZE; @@ -18,6 +20,7 @@ parameter int INTERFACE_CNT = 4; parameter int INTERFACE_ADDR_LEN = 2; parameter int CRC_BITS = 8; parameter shortint NEW_SLOT_COOLDOWN = 500; -parameter shortint INTERFACE_IDLE_COUNTDOWN = 4; +parameter shortint RX_LOAD_COOLDOWN = 4; +parameter shortint TX_LOAD_COOLDOWN = 4; `endif