completed fabric logic (untested)

This commit is contained in:
2025-06-07 01:01:00 -04:00
parent 018b7a3fcf
commit 33aaa43fa0
4 changed files with 365 additions and 65 deletions

View File

@ -0,0 +1,86 @@
# Fabric's Basic Logic Done
Date: 2025-06-06
## Goals and expectations
Complete the fabric's code.
## Results
Done.
And also revised the directory layout (swapped `fabric/` and `src/`).
As a bonus, I also modularized the interfaces' code, so that it looks
a bit cleaner than it used to.
And as a second bonus, I completed the drop logic.
## Packet dropping
Packet dropping can be caused by two issues:
1. The packet memory is full, in which case the hub will notify all
interfaces to drop their packets.
2. The destination's packet queue is full, in which case the hub will
drop the packet by immediately reclaiming the slot via the reuse
logic.
There are a few scenarios in which this can play out. Notably, if both
the packet memory and the destination's packet queue are full, and the
destination's queue filled up before the packet memory did, then the
**hub** will drop the packet instead of the source interface.
## Reflections
1. Keep the momentum alive. I kept the momentum I gained yesterday
alive by actively coding today, which helped me pick up my previous
thoughts.
2. Small rewards go a long way. I recently showed some people the
ROSE repo. They didn't read much code, but they were obsessed with
the devlogs, the README, and how organized the entire project is.
This boosted my confidence in thinking that ROSE could be a
game changer for me - I wanted it to become something that people
can appreciate, even if they're not devs, even if they don't read
the code. I enjoyed their praise, and I'm preparing myself to
uphold that.
3. Simplify the logic. Much of today's code was linking different
modules together and having internal states within the modules to
manage the operations. A lot of times, the data is already there,
and there's no reason to stall it for an additional cycle to buffer
it.
4. HDL is better for designs meant for hardware. This isn't
ROSE-specific, but rather a thought that I came across while
implementing the new ARCANE load balancing algorithm for an `ns-3`
sim. It would've been much cleaner to implement a
hardware-targeting algorithm in HDL than in a language like C++.
For example, the three states of ARCANE (explore, normal and
freeze) would be perfectly implemented by an FSM. While the C++
sim would have to schedule timeouts using very explicit (and
seemingly unnecessarily complex) code, SystemVerilog would embrace
that by design.
## Final thoughts
Finally! After more than a month of intermittent coding, I'm close to
completing a viable solution for the fabric. Just some sim tests and
I can begin working on the Linux code for the hosts.
I love coding, I love writing my thoughts down, in code or in plain
words. It's the proof of my existence. I'm here to leave a mark, for
those who'd like to see how I do it.
> C'est cela l'amour, tout donner, tout sacrifier sans espoir de
> retour.
### What is such a thing as love that doesn't hope for anything in return?
It's that the love is for a bigger picture. We'd always hope for
something in return, whether that's tangible or intangible, whether
that's spontaneous or given to us by someone else. But the scale of the
hope is different when we have love. With it, we see a bigger
picture, we see how that love is impulsive, how that love drives us
forward, and we see how in the broader view, it makes us better.
## Next steps
Testing. I'm sure when I throw real traffic at my code, it will go
wrong in ways I don't expect.
Notably, I should test the drop logic with different scenarios:
bursts, incast, and heavy background flows. Simulating congestion is
my thing (but, of course, we won't be running something as complex as
an AllReduce workload).

View File

@ -2,8 +2,7 @@
`include <routing.svh>
// IMPORTANT: interfaces are supposed to keep track of their own packet states
module hub(
input logic rst,
module hub(input logic rst,
input logic sys_clk,
input logic [7:0] rx_byte [INTERFACE_CNT],
input logic rx_valid [INTERFACE_CNT],
@ -16,8 +15,7 @@ module hub(
input logic [QUEUE_ADDR_LEN - 1:0] tx_queue_addr [INTERFACE_CNT],
output logic [QUEUE_ADDR_LEN - 1:0] tx_new_queue[INTERFACE_CNT],
output logic tx_new_queue_valid [INTERFACE_CNT],
input logic tx_new_queue_ready [INTERFACE_CNT],
output logic free_queue_empty);
input logic tx_new_queue_ready [INTERFACE_CNT]);
timeunit 1ns;
timeprecision 1ps;
@ -30,7 +28,8 @@ module hub(
logic [QUEUE_ADDR_LEN - 1:0] new_slot_addr;
logic [QUEUE_ADDR_LEN - 1:0] empty_slot_addr;
logic empty_slot_enqueue;
logic free_queue_empty;
free_queue fqueue(.sys_clk(sys_clk),
.rst(rst),
.request_new_slot(request_new_slot),
@ -54,6 +53,12 @@ module hub(
.write_byte(mem_write_byte),
.write_enable(mem_write_enable),
.read_byte(mem_read_byte));
// Combinational back-pressure towards each interface: rx_ready[i] is high
// when the free queue still has a slot to hand out, or when interface i
// has its reuse_queue_slot flag set.
// Plain blocking assignments are used here; the `assign` keyword inside a
// procedural block is a procedural continuous assignment, which is
// deprecated and not accepted by synthesis tools.
always_comb begin
    foreach (rx_ready[i]) begin
        rx_ready[i] = !free_queue_empty || reuse_queue_slot[i];
    end
end
always_ff @ (posedge sys_clk or rst) begin
if (rst) begin
@ -75,23 +80,17 @@ module hub(
mem_write_byte <= '0;
end else begin
// NOTE: signaled the servicing interface in the last cycle
rx_ready[curr] <= 0;
rx_ready[curr + 1] <= 1;
tx_new_queue_valid[dest_buff[curr - 1]] <= 0;
// IMPORTANT: interfaces should send the byte no matter what, rx_ready is to prevent sending a new byte
// IMPORTANT: interfaces should send the byte no matter what, rx_ready is to prevent starting a new packet
if (rx_valid[curr]) begin
// IMPORTANT: memory_write_addr is ready on the next cycle
if (rx_pkt_addr[curr] == 0 && !reuse_queue_slot[curr] &&
!(|new_slot_cooldown[curr])) begin
if (free_queue_empty) begin
// TODO: handle the drop logic
end else begin
request_new_slot <= 1;
rx_queue_addr[curr] <= new_slot_addr;
mem_write_addr <= {new_slot_addr, rx_pkt_addr[curr]};
new_slot_cooldown[curr] <= NEW_SLOT_COOLDOWN;
end
request_new_slot <= 1;
rx_queue_addr[curr] <= new_slot_addr;
mem_write_addr <= {new_slot_addr, rx_pkt_addr[curr]};
new_slot_cooldown[curr] <= NEW_SLOT_COOLDOWN;
end else begin // if (rx_new_packet[curr])
reuse_queue_slot[curr] <= 0;
mem_write_addr <= {rx_queue_addr[curr], rx_pkt_addr[curr]};

View File

@ -1,16 +1,16 @@
// NOTE: The first byte is used for syncing due to using different clock domains
`include <params.svh>
module spi_interface(
input logic rst,
module spi_interface(input logic rst,
input logic sys_clk,
input logic mosi,
input logic cs,
input logic sclk,
input logic cs,
input logic mosi,
output logic miso,
output logic [7:0] rx_byte,
output logic rx_valid,
input logic rx_ready,
output logic [PACKET_ADDR_LEN - 1:0] rx_pkt_addr,
output logic tx_active,
input logic [7:0] tx_byte,
input logic tx_valid,
output logic tx_ready,
@ -18,8 +18,7 @@ module spi_interface(
output logic [QUEUE_ADDR_LEN - 1:0] tx_queue_addr,
input logic [QUEUE_ADDR_LEN - 1:0] tx_new_queue,
input logic tx_new_queue_valid,
output logic tx_new_queue_ready,
input logic free_queue_empty);
output logic tx_new_queue_ready);
timeunit 1ns;
timeprecision 1ps;
@ -33,19 +32,77 @@ module spi_interface(
.sys_clk(sys_clk),
.clk_rising_edge(sclk_rising_edge),
.clk_falling_edge(sclk_falling_edge));
logic [7:0] rx_buff;
shortint bit_cnt;
logic rx_byte_ready;
logic [7:0] tx_buff;
logic tx_buff_valid;
logic tx_loaded;
spi spi_module(.rst(rst),
.sclk_rising_edge(sclk_rising_edge),
.sclk_falling_edge(sclk_falling_edge),
.cs(cs),
.mosi(mosi),
.miso(miso),
.bit_cnt(bit_cnt),
.rx_buff(rx_buff),
.rx_byte_ready(rx_byte_ready),
.tx_buff(tx_buff),
.tx_buff_valid(tx_buff_valid),
.tx_loaded(tx_loaded));
rx_hub rx_module(.rst(rst),
.sys_clk(sys_clk),
.rx_buff(rx_buff),
.rx_byte_ready(rx_byte_ready),
.rx_byte(rx_byte),
.rx_valid(rx_valid),
.rx_ready(rx_ready),
.rx_pkt_addr(rx_pkt_addr));
assign tx_buff = tx_byte;
tx_hub tx_module(.rst(rst),
.sys_clk(sys_clk),
.tx_active(tx_active),
.tx_loaded(tx_loaded),
.tx_byte(tx_byte),
.tx_valid(tx_valid),
.tx_ready(tx_ready),
.tx_pkt_addr(tx_pkt_addr),
.tx_queue_addr(tx_queue_addr),
.tx_new_queue(tx_new_queue),
.tx_new_queue_valid(tx_new_queue_valid),
.tx_new_queue_ready(tx_new_queue_ready));
endmodule // spi_interface
module spi(input logic rst,
input logic sclk_rising_edge,
input logic sclk_falling_edge,
input logic cs,
input logic mosi,
output logic miso,
output shortint bit_cnt,
output logic [7:0] rx_buff,
output logic rx_byte_ready,
input logic tx_buff_valid,
input logic [7:0] tx_buff,
output logic tx_loaded);
timeunit 1ns;
timeprecision 1ps;
shortint bit_cnt = 0;
logic [7:0] rx_shift;
logic [7:0] tx_shift = 8'b00101010;
logic [7:0] rx_buff = '0;
logic byte_ready = 0;
logic [7:0] tx_shift;
assign miso = tx_shift[7];
always_ff @ (posedge sclk_rising_edge or posedge rst) begin
if (rst) begin
rx_shift <= '0;
rx_buff <= '0;
bit_cnt <= '0;
byte_ready <= 0;
rx_byte_ready <= 0;
end
else begin
if (cs) begin
@ -59,53 +116,33 @@ module spi_interface(
if (bit_cnt == 7) begin
bit_cnt <= 0;
rx_buff <= {rx_shift[6:0], mosi};
byte_ready <= 1;
rx_byte_ready <= 1;
end else begin
byte_ready <= 0;
rx_byte_ready <= 0;
end
end // else: !if(cs)
end // else: !if(rst)
end // always_ff @ (posedge sclk)
shortint idle_cntdn;
logic rx_drained;
always_ff @ (posedge sys_clk or rst) begin
if (rst) begin
rx_drained <= 0;
rx_pkt_addr <= '1;
rx_byte <= '0;
rx_valid <= 0;
idle_cntdn <= 0;
end else begin
if (!rx_drained && byte_ready) begin
rx_byte <= rx_buff;
rx_valid <= 1;
idle_cntdn <= INTERFACE_IDLE_COUNTDOWN;
rx_drained <= 1;
rx_pkt_addr <= rx_pkt_addr + 1;
end else if (!byte_ready) begin
rx_drained <= 0;
if (!(|idle_cntdn)) begin
rx_valid <= 0;
end else begin
idle_cntdn <= idle_cntdn - 1;
end
end
end
end
always_ff @ (posedge sclk_falling_edge or rst) begin
if (rst) begin
tx_shift <= 8'b00101010;
tx_shift <= TX_DEFAULT;
tx_loaded <= 0;
end else begin
if (cs) begin
tx_shift <= 0;
tx_shift <= TX_DEFAULT;
end else begin
if (bit_cnt == 0) begin
tx_shift <= rx_buff[7:0];
if (tx_buff_valid) begin
tx_shift <= tx_buff;
tx_loaded <= 1;
end else begin
tx_shift <= '0;
tx_loaded <= 1;
end
end else begin
tx_shift <= {tx_shift[6:0], 1'b0};
tx_loaded <= 0;
end
end
end // else: !if(rst)
@ -114,13 +151,146 @@ module spi_interface(
$display("-----------------------------------------");
end // always_ff @ (negedge sclk)
assign miso = tx_shift[7];
endmodule // spi
// rx_hub: on sys_clk, forwards bytes flagged by rx_byte_ready from rx_buff
// to rx_byte/rx_valid and advances rx_pkt_addr, unless rx_ready is low or a
// drop is already in progress (rx_dropping).
//   rx_buff / rx_byte_ready : byte and its ready flag from the producer
//   rx_byte / rx_valid      : byte and valid flag presented downstream
//   rx_ready                : downstream permission to accept a byte
//   rx_pkt_addr             : incremented once per accepted byte
module rx_hub(input logic rst,
input logic sys_clk,
input logic [7:0] rx_buff,
input logic rx_byte_ready,
output logic [7:0] rx_byte,
output logic rx_valid,
input logic rx_ready,
output logic [PACKET_ADDR_LEN - 1:0] rx_pkt_addr);
timeunit 1ns;
timeprecision 1ps;
// NOTE(review): the next line looks like residue of a removed line from the
// diff view rather than part of this module -- confirm against the real file.
endmodule // spi_interface
// Cycles (counted while rx_byte_ready is low) before rx_valid is cleared.
shortint rx_load_cooldown;
// Set once the current byte has been forwarded; cleared when rx_byte_ready
// is low, so a byte is forwarded only once while rx_byte_ready stays high.
logic rx_drained;
// Set when a byte arrives while rx_ready is low (or while already dropping).
logic rx_dropping;
// NOTE(review): the next two lines also look like removed-line residue from
// the diff view -- confirm against the real file.
module async_get_clk_edges(
input logic rst,
// NOTE(review): rst has no edge qualifier here, so this block is also
// evaluated when rst falls -- confirm whether `posedge rst` was intended.
always_ff @ (posedge sys_clk or rst) begin
if (rst) begin
rx_drained <= 0;
// All ones, so the first accepted byte wraps rx_pkt_addr to 0.
rx_pkt_addr <= '1;
rx_byte <= '0;
rx_valid <= 0;
rx_load_cooldown <= 0;
rx_dropping <= 0;
end else begin
if (!rx_drained && rx_byte_ready) begin
if (rx_ready && !rx_dropping) begin
// Accept the byte: present it, restart the cooldown, and advance
// the in-packet address.
rx_byte <= rx_buff;
rx_valid <= 1;
rx_load_cooldown <= RX_LOAD_COOLDOWN;
rx_drained <= 1;
rx_pkt_addr <= rx_pkt_addr + 1;
end else begin
// Byte is not forwarded. rx_drained is not set on this path.
// NOTE(review): this branch is therefore re-evaluated every
// sys_clk cycle while rx_byte_ready stays high -- confirm intended.
rx_dropping <= 1;
end
// Being the later non-blocking assignment, this overrides the
// `rx_dropping <= 1` above when both execute in the same cycle.
if (rx_ready && &rx_pkt_addr) begin
rx_dropping <= 0;
end
end else if (!rx_byte_ready) begin
rx_drained <= 0;
// Hold rx_valid until the cooldown has counted down to zero.
if (rx_load_cooldown == 0) begin
rx_valid <= 0;
end else begin
rx_load_cooldown <= rx_load_cooldown - 1;
end
end
end
end // always_ff @ (posedge sys_clk or rst)
endmodule // rx_hub
// tx_hub: system-clock-side transmit bookkeeping for one interface.
//
// Queue addresses offered on tx_new_queue (qualified by tx_new_queue_valid
// and tx_new_queue_ready) are pushed into a local packet_queue. Whenever no
// packet is active and the queue reports a valid head entry, that entry is
// popped and a packet becomes active; tx_pkt_addr then advances by one each
// time tx_loaded is seen with the load cooldown expired, and the packet ends
// after the step taken while tx_pkt_addr is all ones.
//
//   tx_active          : a packet is active or the queue has a valid entry
//   tx_loaded          : the serial side has consumed the current byte
//   tx_ready           : pulsed when tx_pkt_addr is advanced
//   tx_pkt_addr        : byte offset within the active packet
//   tx_queue_addr      : head of the packet queue (driven directly by pkt_q)
//   tx_new_queue_ready : low while the packet queue is full
//
// tx_byte and tx_valid are part of the port list but are not read here.
module tx_hub(input logic rst,
              input logic sys_clk,
              output logic tx_active,
              input logic tx_loaded,
              input logic [7:0] tx_byte,
              input logic tx_valid,
              output logic tx_ready,
              output logic [PACKET_ADDR_LEN - 1:0] tx_pkt_addr,
              output logic [QUEUE_ADDR_LEN - 1:0] tx_queue_addr,
              input logic [QUEUE_ADDR_LEN - 1:0] tx_new_queue,
              input logic tx_new_queue_valid,
              output logic tx_new_queue_ready);
    timeunit 1ns;
    timeprecision 1ps;

    // All internal signals are declared before their first use; a variable
    // may not be referenced ahead of its declaration.
    shortint                     tx_load_cooldown;
    logic                        queue_active;
    logic                        request_new_pkt;
    logic [QUEUE_ADDR_LEN - 1:0] new_queue_addr;
    logic                        enqueue_addr;
    logic                        queue_addr_valid;
    logic                        pkt_queue_full;

    assign tx_new_queue_ready = !pkt_queue_full;
    assign tx_active = queue_active || queue_addr_valid;

    // IMPORTANT: tx_queue_addr is directly linked
    packet_queue pkt_q (.rst(rst),
                        .sys_clk(sys_clk),
                        .request_new_packet(request_new_pkt),
                        .new_queue_addr(new_queue_addr),
                        .enqueue_addr(enqueue_addr),
                        .next_queue_addr(tx_queue_addr),
                        .next_queue_addr_valid(queue_addr_valid),
                        .queue_full(pkt_queue_full));

    // Asynchronous reset is edge-qualified so the block is evaluated only
    // on a clock edge or on reset assertion, never on reset release.
    always_ff @ (posedge sys_clk or posedge rst) begin
        if (rst) begin
            tx_ready <= 0;
            tx_pkt_addr <= '0;
            enqueue_addr <= 0;
            new_queue_addr <= '0;
            request_new_pkt <= 0;
            queue_active <= 0;
            tx_load_cooldown <= 0;
        end else begin
            // Register an accepted queue address; enqueue_addr is a
            // one-cycle strobe towards the packet queue.
            if (tx_new_queue_valid && tx_new_queue_ready) begin
                enqueue_addr <= 1;
                new_queue_addr <= tx_new_queue;
            end else begin
                enqueue_addr <= 0;
            end

            // Start a new packet when idle and the queue has an entry.
            // tx_pkt_addr is set to all ones so the first step wraps to 0.
            if (!queue_active && queue_addr_valid) begin
                request_new_pkt <= 1;
                queue_active <= 1;
                tx_pkt_addr <= '1;
            end else begin
                request_new_pkt <= 0;
            end

            if (queue_active) begin
                if (tx_loaded && tx_load_cooldown == 0) begin
                    tx_ready <= 1;
                    tx_load_cooldown <= TX_LOAD_COOLDOWN;
                    tx_pkt_addr <= tx_pkt_addr + 1;
                    // Stepping from the all-ones address ends the packet.
                    if (&tx_pkt_addr) begin
                        queue_active <= 0;
                    end
                end else begin
                    tx_ready <= 0;
                    if (tx_load_cooldown != 0) begin
                        tx_load_cooldown <= tx_load_cooldown - 1;
                    end
                end
            end else begin
                // Idle: park the address at all ones and let the cooldown
                // run out.
                tx_pkt_addr <= '1;
                if (tx_load_cooldown != 0) begin
                    tx_load_cooldown <= tx_load_cooldown - 1;
                end
            end
        end
    end
endmodule // tx_hub
module async_get_clk_edges(input logic rst,
input logic ext_clk,
input logic sys_clk,
output logic clk_rising_edge,
@ -144,3 +314,45 @@ module async_get_clk_edges(
assign clk_rising_edge = sync_0 & ~sync_1;
assign clk_falling_edge = ~sync_0 & sync_1;
endmodule // async_get_clk_edges
// IMPORTANT: new_queue_addr will always be ready when queue_size is greater than 1
// packet_queue: small FIFO of packet-slot addresses.
//
//   request_new_packet    : pop strobe; advances head
//   new_queue_addr        : address to push
//   enqueue_addr          : push strobe; stores new_queue_addr at tail
//   next_queue_addr       : registered copy of mem[head]
//   next_queue_addr_valid : high while the queue holds at least one entry
//   queue_full            : high while the queue holds INTERFACE_QUEUE_SIZE
//                           entries
//
// head and tail wrap by natural overflow of INTERFACE_QUEUE_ADDR_LEN bits;
// this presumably relies on INTERFACE_QUEUE_SIZE being
// 2**INTERFACE_QUEUE_ADDR_LEN -- TODO confirm in params.svh.
// Pushes while full and pops while empty are not guarded here; callers are
// expected to observe queue_full / next_queue_addr_valid.
module packet_queue(input logic rst,
                    input logic sys_clk,
                    input logic request_new_packet,
                    input logic [QUEUE_ADDR_LEN - 1:0] new_queue_addr,
                    input logic enqueue_addr,
                    output logic [QUEUE_ADDR_LEN - 1:0] next_queue_addr,
                    output logic next_queue_addr_valid,
                    output logic queue_full);
    timeunit 1ns;
    timeprecision 1ps;

    logic [QUEUE_ADDR_LEN - 1:0] mem [INTERFACE_QUEUE_SIZE];
    logic [INTERFACE_QUEUE_ADDR_LEN - 1:0] head;
    logic [INTERFACE_QUEUE_ADDR_LEN - 1:0] tail;
    shortint queue_size;

    // Compare against the full parameter value. Selecting bits [31:16]
    // would pick the upper half-word, which is zero for any realistic
    // queue size and would make the queue report "full" when empty.
    assign queue_full = (int'(queue_size) == int'(INTERFACE_QUEUE_SIZE));
    assign next_queue_addr_valid = (queue_size != 0);

    // Asynchronous reset is edge-qualified so the block is evaluated only
    // on a clock edge or on reset assertion, never on reset release.
    always_ff @ (posedge sys_clk or posedge rst) begin
        if (rst) begin
            head <= '0;
            tail <= '0;
            queue_size <= 0;
            next_queue_addr <= '0;
        end else begin
            if (request_new_packet) begin
                head <= head + 1;
            end

            next_queue_addr <= mem[head];

            if (enqueue_addr) begin
                mem[tail] <= new_queue_addr;
                tail <= tail + 1;
            end

            // Occupancy is updated in one place so that a push and a pop in
            // the same cycle cancel out instead of the later non-blocking
            // assignment silently overriding the earlier one.
            case ({enqueue_addr, request_new_packet})
                2'b10: queue_size <= queue_size + 1;
                2'b01: queue_size <= queue_size - 1;
                default: ; // idle, or simultaneous push and pop
            endcase
        end
    end
endmodule // packet_queue

View File

@ -7,6 +7,8 @@ parameter int PACKET_ADDR_LEN = 6;
parameter int ROSE_ADDR_LEN = 8;
parameter logic [PACKET_ADDR_LEN - 1:0] ROSE_DEST_INDEX = 1;
parameter logic [7:0] TX_DEFAULT = 8'b00101010;
parameter shortint QUEUE_SIZE = 1024;
parameter int QUEUE_ADDR_LEN = 10;
parameter int MEMORY_POOL_SIZE = QUEUE_SIZE * PACKET_SIZE;
@ -18,6 +20,7 @@ parameter int INTERFACE_CNT = 4;
parameter int INTERFACE_ADDR_LEN = 2;
parameter int CRC_BITS = 8;
parameter shortint NEW_SLOT_COOLDOWN = 500;
parameter shortint INTERFACE_IDLE_COUNTDOWN = 4;
parameter shortint RX_LOAD_COOLDOWN = 4;
parameter shortint TX_LOAD_COOLDOWN = 4;
`endif