From 1f7c47a1fb777eaa42e9d4c238b9b91bbed9b787 Mon Sep 17 00:00:00 2001 From: Peisong Xiao Date: Sat, 17 May 2025 22:06:41 -0400 Subject: [PATCH] WORKING PROGRESS. revamped the files and naming, so git is a bit confused mem_hub.sv -> hub.sv spi_slave.sv -> interface.sv --- devlog/2025-05-17-Routing-logic.md | 120 ++++++++++++++++++ fabric/src/hub.sv | 84 +++++++++++++ fabric/src/interface.sv | 188 +++++++++++++++++++++++++++++ fabric/src/mem_hub.sv | 43 ------- fabric/src/spi_slave.sv | 106 ---------------- 5 files changed, 392 insertions(+), 149 deletions(-) create mode 100644 devlog/2025-05-17-Routing-logic.md create mode 100644 fabric/src/hub.sv create mode 100644 fabric/src/interface.sv delete mode 100644 fabric/src/mem_hub.sv delete mode 100644 fabric/src/spi_slave.sv diff --git a/devlog/2025-05-17-Routing-logic.md b/devlog/2025-05-17-Routing-logic.md new file mode 100644 index 0000000..45820da --- /dev/null +++ b/devlog/2025-05-17-Routing-logic.md @@ -0,0 +1,120 @@ +# Routing Logic +Date: 2025-05-17 + +## Goals and expectations +The flu is mostly gone from me, so I expected to get some work done. +The first task is to complete the core routing logic (still with no +congestion handling: if a TX buffer is full, it'll just drop the data +silently). + +## Thought train +The RX queue should also be able to send high-priority messages to the +interface if it's congested because the routing logic is congested. +There can even be congestion management methods developed based on how +full the RX queue is relative to the packet sizes and the number of +connected devices. + +## Results +### Trivial (not really) +Because the younger me was blind and used `verilog-mode`'s default +3-space indentation, the files have been revamped to use 4 spaces for +indentation. And I also renamed some files and modules. + +Indentation is pretty relevant in good code; 3 spaces is probably more +evil than using tabs. 
+ +### Completed routing logic +I had the idea in mind to stream packets directly without any buffer +involved, but to simplify the round-robin logic (and to allow +potentially multiple streams of data), I went with a small buffer to +absorb 1 byte from the interfaces. + +I did re-rethink the routing logic: all interfaces can send incoming +data to the routing logic, so that I don't have to deal with +sync-related issues on the RX side of things, and put the service +buffer inside of the routing logic to work: + +On the RX side, if the buffer for that interface (1 byte) is full or +going to be filled, turn off `rx_ready`. And if the buffer is empty, +it moves the data received from the interface into the service buffer. + +On the TX side, I implemented a round-robin approach and only service +one buffer at any given time. If the destination is ready, send the +byte and set `rx_ready` to true. Also note that due to only servicing +1 TX queue at any given time, I have to update the `tx_valid` bit of +the last destination to avoid sending duplicates. + +#### Potential problem +The `rx_ready` design is currently under evaluation. I have two +approaches in mind. One is the safe one: assigning `rx_ready = +~in_buffer`, which would definitely solve any kind of problem related +to an interface sending when the buffer isn't ready. But this would +mean skipping cycles when 1 interface can directly stream to another. + +Then there's the option of only turning off `rx_ready` when the +interface is trying to write to a full buffer, but the incoming byte +still stays in a register, hence enabling continuous streaming. + +However, since we're polling from any specific buffer only once every +4 cycles, skipping 1 cycle for the RX side of 1 interface +is trivial. So, I went with the first approach. 
+ +However, this gave me inspiration for another thing: I can allow a +direct stream mode so that one device can just stream to another. +Better yet, I can also use a shared pool of memory to avoid any kind +of streaming, although that will significantly impact the logic +involved and reduce queue size flexibility. + +### BRAM access +Exciting stuff, finally getting into BRAM land. A 1-cycle delay is +acceptable when the logic itself is running faster than the interface. +I learned about how to safely access it (within the same clock domain +of course, that's why there's an RX buffer), and wrote some logic for +the RX buffer (incomplete). + +## Reflections +1. Trade-offs are being made. If I construct more complex logic, then + I can eliminate the need for a central routing logic for data, but + there are a few catches to that: 1. the memory management would be + more complex, although it would allow more flexible memory + allocation and handle bursts better, but it would also mean having + 4 smaller queues inside of a bigger memory pool and using a memory + collection queue to keep track of which buffers are + empty; 2. if one interface is congested, that means the + entire fabric is probably going to be congested. + - As always, there's the design of using reserved queues for each + interface and a shared central buffer for handling bursts. BUT + WITH EVEN MORE COMPLEXITY! +2. Reworking the design is acceptable, but I should still keep track + of all of my ideas just in case I want to go back to them one day. + A lot of things came up as I gathered my thoughts for this devlog, + combining unimplemented ideas and my current implementation. Best + to save this devlog for future reference. +3. FPGAs are restrictive, but as I dug deeper into constructing logic + for them, I felt as inspired as when I first found out how programming is + like teaching a child to do everything as explicitly and as + accurately as you can. +4. 
Ideas are cheaper than implementation, but that doesn't mean they'll + stay. Keep track of the ideas. + +## Lessons learned +1. Respect the hardware. Get to know it more, like how BRAM access + has a 1-cycle delay and how non-BRAM variables will use up your + LUTs. +2. There are multiple ways to do things; weigh them carefully and + decide what to do with them. They can be ditched, implemented, or + saved for the future. +3. Rethink and connect. One design choice can lead to another; one + idea can be combined with another. Go back to previous thoughts + and think about how you can refine the current implementation by + taking a page out of those past books. + +## Final thoughts +As I continue working on ROSE, I see more of its potential, and I can +see that I'm taking steps toward realizing much of it. + +I might write down all of my ideas for someone (perhaps me in a more +distant future) to implement all of them. + +## Next steps +Complete the RX and TX queues, and test them out on a testbench. 
diff --git a/fabric/src/hub.sv b/fabric/src/hub.sv new file mode 100644 index 0000000..b5e09b9 --- /dev/null +++ b/fabric/src/hub.sv @@ -0,0 +1,84 @@ +module hub ( + input logic rst, + input logic sys_clk, + input logic [31:0] rx_cmd, // for routing-related commands + input logic [3:0] rx_cmd_valid, + input logic [31:0] rx_byte, + input logic [3:0] rx_valid, + input logic [7:0] rx2tx_dest, // rx byte's destination + input logic [3:0] tx_ready, // if tx_byte is ready to be read + output logic [3:0] rx_ready, // if rx_byte is ready to be read + output logic [7:0] tx_src, // tell the tx where the stream is coming from + output logic [31:0] tx_byte, + output logic [3:0] tx_valid, + output logic [1:0] packet_size); // 4 states for 4 fixed packet sizes + timeunit 1ns; + timeprecision 1ps; + + // TBD: pre-agree on packet size + + // use the round-robin strat to poll since the routing is much faster + // NOTE: To expand to more connected_devices, use a hierarchical design + logic [1:0] curr_service = 0; + logic [1:0] last_dest = 0; + + // one service slot per RX port: destination TX port + payload byte (packed so '0 is a legal reset value) + typedef struct packed { + logic [1:0] dest; + logic [7:0] payload; + } svc_buffer; + svc_buffer service_buffer [3:0]; + svc_buffer curr_buffer; + assign curr_buffer = service_buffer[curr_service]; + logic [3:0] in_buffer; // in_buffer[i] is set while service_buffer[i] holds an unsent byte + assign rx_ready = ~in_buffer; // a port is ready only while its slot is empty + + always_ff @ (posedge sys_clk) begin + if (rst) begin + in_buffer <= '0; + tx_src <= '0; + tx_valid <= '0; + packet_size <= '0; + curr_service <= '0; + last_dest <= '0; + for (int i = 0; i < 4; i++) begin + service_buffer[i] <= '0; + end + end else begin // if (rst) + // Handle RX side logic + for (int i = 0; i < 4; i++) begin + if (rx_valid[i]) begin + if (!in_buffer[i]) begin + service_buffer[i].dest <= get_dest(rx2tx_dest, i[1:0]); + service_buffer[i].payload <= get_byte(rx_byte, i[1:0]); + in_buffer[i] <= 1; + end + end + end + + // Handle TX side logic + tx_valid[last_dest] <= 0; // drop last cycle's pulse first so a send to the same port below wins + if (in_buffer[curr_service] && tx_ready[curr_buffer.dest]) begin + tx_byte[{curr_buffer.dest, 3'b000} +: 8] + <= 
curr_buffer.payload; + tx_src[{curr_buffer.dest, 1'b0} +: 2] + <= curr_service; + in_buffer[curr_service] <= 0; + tx_valid[curr_buffer.dest] <= 1; + end + last_dest <= service_buffer[curr_service].dest; // port whose tx_valid is cleared next cycle + curr_service <= curr_service + 1; // 2-bit counter wraps, giving the round-robin + end // else: !if(rst) + end // always_ff @ (posedge sys_clk) + +endmodule // hub + +function automatic logic [1:0] get_dest(input logic [7:0] dest_map, + input logic [1:0] idx); + return dest_map[{idx, 1'b0} +: 2]; // 2-bit field number idx of the destination map +endfunction // get_dest + +function automatic logic [7:0] get_byte(input logic [31:0] byte_arr, + input logic [1:0] idx); + return byte_arr[{idx, 3'b000} +: 8]; // byte number idx of the 4-byte bus +endfunction // get_byte diff --git a/fabric/src/interface.sv b/fabric/src/interface.sv new file mode 100644 index 0000000..4753c09 --- /dev/null +++ b/fabric/src/interface.sv @@ -0,0 +1,188 @@ +// NOTE: The first byte is used for syncing due to using different clock domains +`define SYNC_2FF +module spi_interface( + input logic rst, + input logic sys_clk, + input logic mosi, + input logic cs, + input logic sclk, + input logic rx_ready, + input logic tx_valid, + input logic [7:0] tx_byte, + input logic [1:0] tx_src, + output logic miso, + output logic tx_ready, + output logic rx_valid, + output logic [7:0] rx_byte, + output logic [1:0] rx_dest, + output logic [7:0] rx_cmd, + output logic rx_cmd_valid); + + timeunit 1ns; + timeprecision 1ps; + + // SPI logic + logic sclk_rising_edge; + logic sclk_falling_edge; + + async_get_clk_edges sync (.rst(rst), + .ext_clk(sclk), + .sys_clk(sys_clk), + .clk_rising_edge(sclk_rising_edge), + .clk_falling_edge(sclk_falling_edge)); + + int bit_cnt = 0; + logic [7:0] rx_shift; + logic [7:0] tx_shift = 8'b00101010; + logic [7:0] tx_buff = '0; + logic byte_ready = 0; + + always_ff @ (posedge sclk_rising_edge or posedge rst) begin + if (rst) begin + rx_shift <= '0; + tx_buff <= '0; + bit_cnt <= '0; + byte_ready <= 0; + end + else begin + if (cs) begin + rx_shift <= 0; + tx_buff <= 0; + bit_cnt <= 0; + 
end else begin + rx_shift <= {rx_shift[6:0], mosi}; + bit_cnt <= bit_cnt + 1; + + if (bit_cnt == 7) begin + bit_cnt <= 0; + tx_buff <= {rx_shift[6:0], mosi}; + byte_ready <= 1; + end else + byte_ready <= 0; + end // else: !if(cs) + end // else: !if(rst) + + $display("[%0d] current rx_shift: %b", $time, rx_shift); + $display("[%0d] current bit_cnt: %0d", $time, bit_cnt); + $display("[%0d] current tx_buff: %b", $time, tx_buff); + end // always_ff @ (posedge sclk) + + always_ff @ (posedge sclk_falling_edge) begin + if (rst) begin + tx_shift <= 0; + end + else begin + if (cs) begin + tx_shift <= 0; + end else begin + if (bit_cnt == 0) begin + tx_shift <= tx_buff[7:0]; + end else begin + tx_shift <= {tx_shift[6:0], 1'b0}; + end + end + end // else: !if(rst) + $display("last bit sent: %b", miso); + $display("[%0d] current tx_shift: %b", $time, tx_shift); + $display("-----------------------------------------"); + end // always_ff @ (negedge sclk) + + assign miso = tx_shift[7]; + + shortint packet_size = 64; + + // RX and TX logic + logic [9:0] rx_queue_head = 0; + logic [9:0] rx_queue_tail = 0; + logic [10:0] rx_size; // no initializer: driven solely by the continuous assign below + logic rx_queue_write = 0; + logic [7:0] rx_read; + logic packet_in; + logic rx_queue_empty; + assign rx_size = (rx_queue_tail + 11'd1024 - rx_queue_head) & 11'h3FF; + assign rx_queue_empty = ~(|rx_size); + + bram_1024B rx_queue (.sys_clk(sys_clk), + .write_enable(rx_queue_write), + .read_addr(rx_queue_head), + .write_addr(rx_queue_tail), + .write_data(tx_buff), + .read_data(rx_read)); + + always_ff @ (posedge sys_clk) begin + if (rst) begin + rx_queue_head <= '0; + rx_queue_tail <= '0; + rx_queue_write <= '0; + // rx_read is driven by rx_queue's read_data port; assigning it here would add a second driver + packet_in <= 0; + end else begin + if (byte_ready) + rx_queue_write <= 1; + else + rx_queue_write <= 0; + if (!packet_in && rx_size > 2) begin + // CONSULT internal routing table for directions + end + end + end + +endmodule // spi_interface + +module async_get_clk_edges( + input logic rst, + input logic ext_clk, + input logic 
sys_clk, + output logic clk_rising_edge, + output logic clk_falling_edge); + timeunit 1ns; + timeprecision 1ps; +`ifdef SYNC_2FF + logic sync_0 = 0; + logic sync_1 = 0; + + always_ff @ (posedge sys_clk) begin + if (rst) begin + sync_0 <= 0; + sync_1 <= 0; + end else begin + sync_0 <= ext_clk; + sync_1 <= sync_0; + end + end + + assign clk_rising_edge = sync_0 & ~sync_1; + assign clk_falling_edge = ~sync_0 & sync_1; +`else // !`ifdef SYNC_2FF + logic [2:0] clk_sync = 0; + + always_ff @ (posedge sys_clk) begin + if (rst) clk_sync <= '0; // reset clears the sampler, matching the SYNC_2FF branch + else clk_sync <= {clk_sync[1:0], ext_clk}; // shift in a new ext_clk sample every sys_clk + end + + assign clk_rising_edge = (clk_sync[2:1] == 2'b01); + assign clk_falling_edge = (clk_sync[2:1] == 2'b10); +`endif // !`ifdef SYNC_2FF +endmodule // async_get_clk_edges + +module bram_1024B ( + input logic sys_clk, + input logic write_enable, + input logic [9:0] read_addr, + input logic [9:0] write_addr, + input logic [7:0] write_data, + output logic [7:0] read_data); + timeunit 1ns; + timeprecision 1ps; + + + logic [7:0] mem [0:1023]; + + always_ff @(posedge sys_clk) begin + if (write_enable) + mem[write_addr] <= write_data; + read_data <= mem[read_addr]; // registered read: data follows the address by one sys_clk + end + +endmodule // bram_1024B diff --git a/fabric/src/mem_hub.sv b/fabric/src/mem_hub.sv deleted file mode 100644 index 7e8138d..0000000 --- a/fabric/src/mem_hub.sv +++ /dev/null @@ -1,43 +0,0 @@ -module mem_hub (input logic rst, - input logic sys_clk, - input logic [3:0] connected_devices, // manually configured - input logic [3:0][7:0] rx_cmd, // for routing-related commands - input logic [3:0] rx_cmd_valid, - input logic [3:0][7:0] rx_byte, - input logic [3:0] rx_valid, - input logic [3:0][1:0] rx2tx_dest, // rx byte's destination - input logic [3:0] tx_ready, // if tx_byte was read - output logic [3:0] rx_ready, // if rx_byte was read - output logic [3:0][1:0] tx_src, // tell the tx where the stream is comming from - output logic [3:0][7:0] tx_byte, - output logic [3:0] tx_valid, - output logic [1:0] packet_size); // 4 states for 4 fixed packet 
sizes - timeunit 1ns; - timeprecision 1ps; - - // TBD: pre-agree on packet size - - // use the round-robin strat to poll since the routing is much faster - // NOTE: To expand to more connected_devices, use a hierarchical design - logic [1:0] curr_service = 0; - - // src dest byte - logic [1:0][1:0][7:0] service_buffer; - logic [3:0] in_buffer; - - // Core service logic - always_ff @ (posedge sys_clk) begin - if (rst) begin - rx_ready <= '1; - tx_src <= '0; - tx_valid <= '0; - packet_size <= '0; - service_buffer <= '0; - curr_service <= '0; - end else if (rx_valid[curr_service]) begin - - end - - curr_service <= curr_service + 1; - end -endmodule // mem_hub diff --git a/fabric/src/spi_slave.sv b/fabric/src/spi_slave.sv deleted file mode 100644 index 623f4c3..0000000 --- a/fabric/src/spi_slave.sv +++ /dev/null @@ -1,106 +0,0 @@ -// NOTE: The first byte is used for syncing due to using different clock domains -`define SYNC_2FF -module spi_slave( - input logic sys_clk, - input logic mosi, - input logic cs, - input logic sclk, - input logic rst, - output logic miso); - timeunit 1ns; - timeprecision 1ps; - - logic sclk_rising_edge; - logic sclk_falling_edge; - - async_get_clk_edges sync (.ext_clk(sclk), - .sys_clk(sys_clk), - .clk_rising_edge(sclk_rising_edge), - .clk_falling_edge(sclk_falling_edge)); - - int bit_cnt = 0; - logic [7:0] rx_shift; - logic [7:0] tx_shift = 8'b00101010; - logic [8:0] tx_buff = '0; - logic byte_ready = 0; - - always_ff @ (posedge sclk_rising_edge or posedge rst) begin - if (rst) begin - rx_shift <= 0; - tx_buff <= 0; - bit_cnt <= 0; - end - else begin - if (cs) begin - rx_shift <= 0; - tx_buff <= 0; - bit_cnt <= 0; - end else begin - rx_shift <= {rx_shift[6:0], mosi}; - bit_cnt <= bit_cnt + 1; - - if (bit_cnt == 7) begin - bit_cnt <= 0; - tx_buff <= {rx_shift[6:0], mosi} + 1; - end - end // else: !if(cs) - end // else: !if(rst) - - $display("[%0d] current rx_shift: %b", $time, rx_shift); - $display("[%0d] current bit_cnt: %0d", $time, 
bit_cnt); - $display("[%0d] current tx_buff: %b", $time, tx_buff); - end // always_ff @ (posedge sclk) - - always_ff @ (posedge sclk_falling_edge) begin - if (rst) begin - tx_shift <= 0; - end - else begin - if (cs) begin - tx_shift <= 0; - end else begin - if (bit_cnt == 0) begin - tx_shift <= tx_buff[7:0]; - end else begin - tx_shift <= {tx_shift[6:0], 1'b0}; - end - end - end // else: !if(rst) - $display("last bit sent: %b", miso); - $display("[%0d] current tx_shift: %b", $time, tx_shift); - $display("-----------------------------------------"); - end // always_ff @ (negedge sclk) - - assign miso = tx_shift[7]; - -endmodule // spi_slave - -module async_get_clk_edges( - input logic ext_clk, - input logic sys_clk, - output logic clk_rising_edge, - output logic clk_falling_edge); - timeunit 1ns; - timeprecision 1ps; -`ifdef SYNC_2FF - logic sync_0; - logic sync_1; - - always_ff @ (posedge sys_clk) begin - sync_0 <= ext_clk; - sync_1 <= sync_0; - end - - assign clk_rising_edge = sync_0 & ~sync_1; - assign clk_falling_edge = ~sync_0 & sync_1; -`else // !`ifdef SYNC_2FF - logic [2:0] clk_sync; - - always_ff @ (posedge sys_clk) begin - clk_sync <= {clk_sync[1:0], ext_clk}; - end - - assign clk_rising_edge = (clk_sync[2:1] == 2'b01); - assign clk_falling_edge = (clk_sync[2:1] == 2'b10); -`endif // !`ifdef SYNC_2FF -endmodule