From 1f7c47a1fb777eaa42e9d4c238b9b91bbed9b787 Mon Sep 17 00:00:00 2001
From: Peisong Xiao <peisong.xiao.xps@gmail.com>
Date: Sat, 17 May 2025 22:06:41 -0400
Subject: [PATCH] WORKING PROGRESS. revamped the files and naming, so git is a
 bit confused mem_hub.sv -> hub.sv spi_slave.sv -> interface.sv

---
 devlog/2025-05-17-Routing-logic.md | 120 ++++++++++++++++++
 fabric/src/hub.sv                  |  84 +++++++++++++
 fabric/src/interface.sv            | 188 +++++++++++++++++++++++++++++
 fabric/src/mem_hub.sv              |  43 -------
 fabric/src/spi_slave.sv            | 106 ----------------
 5 files changed, 392 insertions(+), 149 deletions(-)
 create mode 100644 devlog/2025-05-17-Routing-logic.md
 create mode 100644 fabric/src/hub.sv
 create mode 100644 fabric/src/interface.sv
 delete mode 100644 fabric/src/mem_hub.sv
 delete mode 100644 fabric/src/spi_slave.sv

diff --git a/devlog/2025-05-17-Routing-logic.md b/devlog/2025-05-17-Routing-logic.md
new file mode 100644
index 0000000..45820da
--- /dev/null
+++ b/devlog/2025-05-17-Routing-logic.md
@@ -0,0 +1,120 @@
+# Routing Logic
+Date: 2025-05-17
+
+## Goals and expectations
+The flu is mostly gone from me, so I expected to get some work done.
+The first is to complete the core routing logic (still, no congestion
+considered, if a TX buffer is full, then it'll just drop the data
+silently).
+
+## Thought train
+The RX queue should also be able to send high-priority messages to the
+interface if it's congested because the routing logic is congested.
+There can even be congestion management methods developed based on how
+full the RX queue is relative to the packet sizes and the number of
+connected devices.
+
+## Results
+### Trivial (not really)
+Because my younger self blindly used `verilog-mode`'s default 3-space
+indentation, the files have been revamped to use 4 spaces for
+indentation.  I also renamed some files and modules.
+
+Indentation is pretty relevant in good code; 3 spaces is probably more
+evil than using tabs.
+
+### Completed routing logic
+I had the idea in mind to stream packets directly without any buffer
+involved, but to simplify the round-robin logic (and to potentially
+allow multiple streams of data), I went with a small buffer to
+absorb 1 byte from the interfaces.
+
+I did re-rethink the routing logic: all interfaces can send incoming
+data at the routing logic, so that I don't have to deal with
+sync-related issues on the RX side of things, and put the service
+buffer inside of the routing logic to work:
+
+On the RX side, if the buffer for that interface (1 byte) is full or
+going to be filled, turn off `rx_ready`.  And if the buffer is empty,
+it moves the data received from the interface into the service buffer.
+
+On the TX side, I implemented a round-robin approach and only service
+one buffer at any given time.  If the destination is ready, send the
+byte and set `rx_ready` to true.  Also note that due to only servicing
+1 TX queue at any given time, I have to update the `tx_valid` bit of
+the last destination to avoid sending duplicates.
+
+#### Potential problem
+The `rx_ready` design is currently under evaluation.  I have two
+approaches in mind.  The safe one is assigning `rx_ready =
+~in_buffer`, which would definitely solve any problem related to an
+interface sending when the buffer isn't ready.  But this would mean
+skipping cycles when 1 interface could directly stream to another.
+
+Then there's the option of only turning off `rx_ready` when the
+interface is trying to write to a full buffer, but the incoming byte
+still stays in a register and hence enabling continuous streaming.
+
+However, since we're polling from any specific buffer only once every
+4 cycles, that means skipping 1 cycle for the RX side of 1 interface
+is trivial.  So, I went with the first approach.
+
+However, this gave me inspiration for another thing: I can allow a
+direct stream mode so that one device can just stream to another,
+better yet, I can also use a shared pool of memory to avoid any kind
+of streaming, although that will significantly impact the logic
+involved and reduce queue size flexibility.
+
+### BRAM access
+Exciting stuff, finally getting into BRAM land, a 1 cycle delay is
+acceptable when the logic itself is running faster than the interface.
+I learned about how to safely access it (within the same clock domain
+of course, that's why there's an RX buffer), and wrote some logic for
+the RX buffer (incomplete).
+
+## Reflections
+1. Trade-offs are being made.  If I construct more complex logic, then
+   I can eliminate the need for a central routing logic for data, but
+   there's a few catches to that: 1. the memory management would be
+   more complex, although it would allow more flexible memory
+   allocation and handle bursts better, but it would also mean having
+   4 smaller queues inside of a bigger memory pool and using a memory
+   collection queue to keep track of which buffers are
+   empty; 2. If one interface is being congested, that means the
+   entire fabric is probably going to be congested.
+   - As always, there's the design of using reserved queues for each
+     interface and a shared central buffer for handling bursts.  BUT
+     WITH EVEN MORE COMPLEXITY!
+2. Reworking the design is acceptable, but I should still keep track
+   of all of my ideas just in case I want to go back to them one day.
+   A lot of things came up as I gathered my thoughts for this devlog,
+   combining unimplemented ideas and my current implementation.  Best
+   to save this devlog for future references.
+3. FPGAs are restricting, but as I dug deeper into constructing logic
+   for them, I felt as inspired as when I first found out that
+   programming is like teaching a child to do everything as explicitly
+   and as accurately as you can.
+4. Ideas are cheaper than implementation, but that doesn't mean
+   they'll stay.  Keep track of the ideas.
+   
+## Lessons learned
+1. Respect the hardware.  Get to know it more, like how BRAM access
+   has a 1 cycle delay and how non-BRAM variables will use up your
+   LUTs.
+2. There are multiple ways to do things; weigh them carefully and
+   decide what to do with them.  They can be ditched, implemented, or
+   saved for the future.
+3. Rethink and connect.  One design choice can lead to another, one
+   idea can be combined with another.  Go back to previous thoughts
+   and think about how you can refine the current implementation by
+   taking a page out of those past books.
+
+## Final thoughts
+As I continue working on ROSE, I see more of its potential, and I can
+see that I'm taking steps toward realizing much of it.
+
+I might write down all of my ideas for someone (perhaps me in a more
+distant future) to implement all of them.
+
+## Next steps
+Complete the RX and TX queues, and test them out on a testbench.
diff --git a/fabric/src/hub.sv b/fabric/src/hub.sv
new file mode 100644
index 0000000..b5e09b9
--- /dev/null
+++ b/fabric/src/hub.sv
@@ -0,0 +1,84 @@
+// hub: routes single bytes between four interfaces.  Each source lane has a
+// 1-byte service buffer; one buffer is polled per sys_clk cycle (round-robin)
+// and its byte is forwarded once the destination's tx_ready bit is set.
+module hub (
+            input logic         rst,          // synchronous, active-high
+            input logic         sys_clk,
+            input logic [31:0]  rx_cmd,       // for routing-related commands (unused so far)
+            input logic [3:0]   rx_cmd_valid, // (unused so far)
+            input logic [31:0]  rx_byte,      // 4 lanes x 8 bits; lane i is bits [8*i +: 8]
+            input logic [3:0]   rx_valid,
+            input logic [7:0]   rx2tx_dest,   // rx byte's destination; lane i is bits [2*i +: 2]
+            input logic [3:0]   tx_ready,     // a byte is only sent to dest i while bit i is set
+            output logic [3:0]  rx_ready,     // bit i set while lane i's service buffer is empty
+            output logic [7:0]  tx_src,       // tell the tx where the stream is coming from
+            output logic [31:0] tx_byte,
+            output logic [3:0]  tx_valid,
+            output logic [1:0]  packet_size); // 4 states for 4 fixed packet sizes
+    timeunit 1ns;
+    timeprecision 1ps;
+
+    // TBD: pre-agree on packet size
+    // use the round-robin strat to poll since the routing is much faster
+    // NOTE: To expand to more connected_devices, use a hierarchical design
+    logic [1:0]           curr_service = 0;
+    logic [1:0]           last_dest = 0;
+
+    // Per-lane pending byte and its destination.  Packed so that a whole
+    // entry can legally be cleared with '0 in the reset branch.
+    typedef struct packed {
+        logic [1:0] dest;
+        logic [7:0] payload;
+    } svc_buffer;
+    svc_buffer service_buffer [3:0];
+    svc_buffer curr_buffer;
+    assign curr_buffer = service_buffer[curr_service];
+    logic [3:0]           in_buffer;    // bit i set while service_buffer[i] holds a byte
+    assign rx_ready = ~in_buffer;
+
+    always_ff @ (posedge sys_clk) begin
+        if (rst) begin
+            in_buffer <= '0;
+            tx_src <= '0;
+            tx_byte <= '0;
+            tx_valid <= '0;
+            packet_size <= '0;
+            curr_service <= '0;
+            last_dest <= '0;
+            for (int i = 0; i < 4; i++) service_buffer[i] <= '0;
+        end else begin // if (rst)
+            // Handle RX side logic: latch a valid byte only into an empty buffer
+            for (int i = 0; i < 4; i++) begin
+                if (rx_valid[i] && !in_buffer[i]) begin
+                    service_buffer[i].dest <= get_dest(rx2tx_dest, i[1:0]);
+                    service_buffer[i].payload <= get_byte(rx_byte, i[1:0]);
+                    in_buffer[i] <= 1;
+                end
+            end
+
+            // Handle TX side logic.  Drop the previous strobe first: when this
+            // cycle sends to that same destination, the set below is the later
+            // nonblocking assignment and wins, so the byte is not lost.
+            tx_valid[last_dest] <= 0;
+            if (in_buffer[curr_service] && tx_ready[curr_buffer.dest]) begin
+                tx_byte[{curr_buffer.dest, 3'b000} +: 8] <= curr_buffer.payload;
+                tx_src[{curr_buffer.dest, 1'b0} +: 2] <= curr_service;
+                in_buffer[curr_service] <= 0;
+                tx_valid[curr_buffer.dest] <= 1;
+            end
+            last_dest <= curr_buffer.dest;
+            curr_service <= curr_service + 1;
+        end // else: !if(rst)
+    end // always_ff @ (posedge sys_clk)
+
+endmodule // hub
+
+// Extract lane `idx`'s 2-bit destination field from the packed 4-lane map.
+function automatic logic [1:0] get_dest(input logic [7:0] dest_map, input logic [1:0] idx);
+    get_dest = dest_map[(idx * 2) +: 2];
+endfunction // get_dest
+
+// Extract lane `idx`'s byte from the packed 4-lane, 32-bit word.
+function automatic logic [7:0] get_byte(input logic [31:0] byte_arr, input logic [1:0] idx);
+    get_byte = byte_arr[(idx * 8) +: 8];
+endfunction // get_byte
diff --git a/fabric/src/interface.sv b/fabric/src/interface.sv
new file mode 100644
index 0000000..4753c09
--- /dev/null
+++ b/fabric/src/interface.sv
@@ -0,0 +1,188 @@
+// NOTE: The first byte is used for syncing due to using different clock domains
+`define SYNC_2FF
+// spi_interface: SPI-facing side of one fabric port (work in progress).
+// Shifts mosi in on synchronized sclk rising edges, echoes the last completed
+// byte back on miso, and presents completed bytes to a 1 KiB BRAM RX queue.
+// The hub-facing outputs (tx_ready, rx_valid, rx_byte, ...) are not driven yet.
+module spi_interface(
+                     input logic        rst,
+                     input logic        sys_clk,
+                     input logic        mosi,
+                     input logic        cs,
+                     input logic        sclk,
+                     input logic        rx_ready,
+                     input logic        tx_valid,
+                     input logic [7:0]  tx_byte,
+                     input logic [1:0]  tx_src,
+                     output logic       miso,
+                     output logic       tx_ready,
+                     output logic       rx_valid,
+                     output logic [7:0] rx_byte,
+                     output logic [1:0] rx_dest,
+                     output logic [7:0] rx_cmd,
+                     output logic       rx_cmd_valid);
+
+    timeunit 1ns;
+    timeprecision 1ps;
+
+    // SPI logic
+    logic sclk_rising_edge;     // pulse from the sys_clk-domain edge detector
+    logic sclk_falling_edge;
+
+    async_get_clk_edges sync (.rst(rst),
+                              .ext_clk(sclk),
+                              .sys_clk(sys_clk),
+                              .clk_rising_edge(sclk_rising_edge),
+                              .clk_falling_edge(sclk_falling_edge));
+
+    int         bit_cnt = 0;
+    logic [7:0] rx_shift;
+    logic [7:0] tx_shift = 8'b00101010;
+    logic [7:0] tx_buff = '0;   // last completed RX byte; also the BRAM write data
+    logic       byte_ready = 0; // set when the 8th bit of a byte was just shifted in
+
+    always_ff @ (posedge sclk_rising_edge or posedge rst) begin
+        if (rst) begin
+            rx_shift <= '0;
+            tx_buff <= '0;
+            bit_cnt <= '0;
+            byte_ready <= 0;
+        end
+        else begin
+            if (cs) begin
+                rx_shift <= 0;
+                tx_buff <= 0;
+                bit_cnt <= 0;
+            end else begin
+                rx_shift <= {rx_shift[6:0], mosi};
+                bit_cnt <= bit_cnt + 1;
+
+                if (bit_cnt == 7) begin
+                    bit_cnt <= 0;
+                    tx_buff <= {rx_shift[6:0], mosi};
+                    byte_ready <= 1;
+                end else
+                    byte_ready <= 0;
+            end // else: !if(cs)
+        end // else: !if(rst)
+
+        $display("[%0d] current rx_shift: %b", $time, rx_shift);
+        $display("[%0d] current bit_cnt: %0d", $time, bit_cnt);
+        $display("[%0d] current tx_buff:	%b", $time, tx_buff);
+    end // always_ff @ (posedge sclk)
+
+    always_ff @ (posedge sclk_falling_edge) begin
+        if (rst) begin
+            tx_shift <= 0;
+        end
+        else begin
+            if (cs) begin
+                tx_shift <= 0;
+            end else begin
+                if (bit_cnt == 0) begin
+                    tx_shift <= tx_buff[7:0];   // byte boundary: load the next byte to send
+                end else begin
+                    tx_shift <= {tx_shift[6:0], 1'b0};
+                end
+            end
+        end // else: !if(rst)
+        $display("last bit sent: %b", miso);
+        $display("[%0d] current tx_shift: %b", $time, tx_shift);
+        $display("-----------------------------------------");
+    end // always_ff @ (negedge sclk)
+
+    assign miso = tx_shift[7];
+
+    shortint packet_size = 64;
+
+    // RX and TX logic
+    logic [9:0] rx_queue_head = 0;   // NOTE: head/tail are only ever reset so far
+    logic [9:0] rx_queue_tail = 0;
+    logic [10:0] rx_size;            // continuously assigned below, so no initializer
+    logic        rx_queue_write = 0;
+    logic [7:0]  rx_read;            // driven only by rx_queue's read_data port
+    logic        packet_in;
+    logic        rx_queue_empty;
+    assign rx_size = (rx_queue_tail + 11'd1024 - rx_queue_head) & 11'h3FF;
+    assign rx_queue_empty = ~(|rx_size);
+
+    bram_1024B rx_queue (.sys_clk(sys_clk),
+                         .write_enable(rx_queue_write),
+                         .read_addr(rx_queue_head),
+                         .write_addr(rx_queue_tail),
+                         .write_data(tx_buff),
+                         .read_data(rx_read));
+
+    always_ff @ (posedge sys_clk) begin
+        if (rst) begin
+            rx_queue_head <= '0;
+            rx_queue_tail <= '0;
+            rx_queue_write <= '0;
+            packet_in <= 0;          // rx_read belongs to the BRAM port; not reset here
+        end else begin
+            rx_queue_write <= byte_ready;
+            if (!packet_in && rx_size > 2) begin
+                // CONSULT internal routing table for directions
+            end
+        end
+    end
+
+endmodule // spi_interface
+
+// async_get_clk_edges: samples ext_clk with sys_clk and raises a one-cycle
+// pulse on clk_rising_edge / clk_falling_edge when consecutive samples differ.
+module async_get_clk_edges(
+                           input logic  rst,
+                           input logic  ext_clk,
+                           input logic  sys_clk,
+                           output logic clk_rising_edge,
+                           output logic clk_falling_edge);
+    timeunit 1ns;
+    timeprecision 1ps;
+`ifdef SYNC_2FF
+    // NOTE(review): the edge detect reads sync_0, the first flop after the async input -- confirm this is acceptable
+    logic sync_0 = 0;
+    logic sync_1 = 0;
+    always_ff @ (posedge sys_clk) begin
+        if (rst) begin
+            sync_0 <= 0;
+            sync_1 <= 0;
+        end else begin
+            sync_0 <= ext_clk;
+            sync_1 <= sync_0;
+        end
+    end
+
+    assign clk_rising_edge = sync_0 & ~sync_1;
+    assign clk_falling_edge = ~sync_0 & sync_1;
+`else // !`ifdef SYNC_2FF
+    logic [2:0] clk_sync = 0;
+    always_ff @ (posedge sys_clk) begin
+        if (rst) clk_sync <= '0;    // clear on reset; sample ext_clk only when out of reset
+        else     clk_sync <= {clk_sync[1:0], ext_clk};
+    end
+    assign clk_rising_edge = (clk_sync[2:1] == 2'b01);
+    assign clk_falling_edge = (clk_sync[2:1] == 2'b10);
+`endif // !`ifdef SYNC_2FF
+endmodule // async_get_clk_edges
+
+// bram_1024B: 1024 x 8-bit memory with one write port and one registered
+// read port, both clocked by sys_clk; read_data lags read_addr by one cycle.
+module bram_1024B (
+                   input logic        sys_clk,
+                   input logic        write_enable,
+                   input logic [9:0]  read_addr,
+                   input logic [9:0]  write_addr,
+                   input logic [7:0]  write_data,
+                   output logic [7:0] read_data);
+    timeunit 1ns;
+    timeprecision 1ps;
+
+    logic [7:0] ram [0:1023];
+
+    always_ff @(posedge sys_clk) begin
+        // Nonblocking read: a same-cycle write to read_addr returns the old byte.
+        read_data <= ram[read_addr];
+        if (write_enable) ram[write_addr] <= write_data;
+    end
+endmodule // bram_1024B
diff --git a/fabric/src/mem_hub.sv b/fabric/src/mem_hub.sv
deleted file mode 100644
index 7e8138d..0000000
--- a/fabric/src/mem_hub.sv
+++ /dev/null
@@ -1,43 +0,0 @@
-module mem_hub (input logic             rst,
-                 input logic             sys_clk,
-                 input logic [3:0]       connected_devices, // manually configured
-                 input logic [3:0][7:0]  rx_cmd,       // for routing-related commands
-                 input logic [3:0]       rx_cmd_valid,
-                 input logic [3:0][7:0]  rx_byte,
-                 input logic [3:0]       rx_valid,
-                 input logic [3:0][1:0]  rx2tx_dest,   // rx byte's destination
-                 input logic [3:0]       tx_ready,     // if tx_byte was read
-                 output logic [3:0]      rx_ready,     // if rx_byte was read
-                 output logic [3:0][1:0] tx_src,       // tell the tx where the stream is comming from
-                 output logic [3:0][7:0] tx_byte,
-                 output logic [3:0] tx_valid,
-                 output logic [1:0] packet_size); // 4 states for 4 fixed packet sizes
-   timeunit 1ns;
-   timeprecision 1ps;
-   
-   // TBD: pre-agree on packet size
-
-   // use the round-robin strat to poll since the routing is much faster
-   // NOTE: To expand to more connected_devices, use a hierarchical design
-   logic [1:0]           curr_service = 0;
-
-   // src dest byte
-   logic [1:0][1:0][7:0] service_buffer;
-   logic [3:0]           in_buffer;
-
-   // Core service logic
-   always_ff @ (posedge sys_clk) begin
-       if (rst) begin
-           rx_ready <= '1;
-           tx_src <= '0;
-           tx_valid <= '0;
-           packet_size <= '0;
-           service_buffer <= '0;
-           curr_service <= '0;
-       end else if (rx_valid[curr_service]) begin
-           
-       end
-       
-       curr_service <= curr_service + 1;
-   end
-endmodule // mem_hub
diff --git a/fabric/src/spi_slave.sv b/fabric/src/spi_slave.sv
deleted file mode 100644
index 623f4c3..0000000
--- a/fabric/src/spi_slave.sv
+++ /dev/null
@@ -1,106 +0,0 @@
-// NOTE: The first byte is used for syncing due to using different clock domains
-`define SYNC_2FF
-module spi_slave(
-		 input logic  sys_clk,
-		 input logic  mosi,
-		 input logic  cs,
-		 input logic  sclk,
-		 input logic  rst,
-		 output logic miso);
-   timeunit 1ns;
-   timeprecision 1ps;
-
-   logic sclk_rising_edge;
-   logic sclk_falling_edge;
-   
-   async_get_clk_edges sync (.ext_clk(sclk),
-			     .sys_clk(sys_clk),
-			     .clk_rising_edge(sclk_rising_edge),
-			     .clk_falling_edge(sclk_falling_edge));
-
-   int	       bit_cnt = 0;
-   logic [7:0] rx_shift;
-   logic [7:0] tx_shift = 8'b00101010;
-   logic [8:0] tx_buff = '0;
-   logic       byte_ready = 0;
-
-   always_ff @ (posedge sclk_rising_edge or posedge rst) begin
-       if (rst) begin
-	   rx_shift <= 0;
-	   tx_buff <= 0;
-	   bit_cnt <= 0;
-       end
-       else begin
-	   if (cs) begin
-	       rx_shift <= 0;
-	       tx_buff <= 0;
-	       bit_cnt <= 0;
-	   end else begin
-	       rx_shift <= {rx_shift[6:0], mosi};
-	       bit_cnt <= bit_cnt + 1;
-	       
-	       if (bit_cnt == 7) begin
-	           bit_cnt <= 0;
-	           tx_buff <= {rx_shift[6:0], mosi} + 1;
-	       end
-	   end // else: !if(cs)
-       end // else: !if(rst)
-
-       $display("[%0d] current rx_shift: %b", $time, rx_shift);
-       $display("[%0d] current bit_cnt: %0d", $time, bit_cnt);
-       $display("[%0d] current tx_buff:	%b", $time, tx_buff);
-   end // always_ff @ (posedge sclk)
-   
-   always_ff @ (posedge sclk_falling_edge) begin
-       if (rst) begin
-	   tx_shift <= 0;
-       end
-       else begin
-	   if (cs) begin
-	       tx_shift <= 0;
-	   end else begin
-	       if (bit_cnt == 0) begin
-	           tx_shift <= tx_buff[7:0];
-	       end else begin
-	           tx_shift <= {tx_shift[6:0], 1'b0};
-	       end
-	   end
-       end // else: !if(rst)
-       $display("last bit sent: %b", miso);
-       $display("[%0d] current tx_shift: %b", $time, tx_shift);
-       $display("-----------------------------------------");
-   end // always_ff @ (negedge sclk)
-
-   assign miso = tx_shift[7];
-   
-endmodule // spi_slave
-
-module async_get_clk_edges(
-			   input logic	ext_clk,
-			   input logic	sys_clk,
-			   output logic	clk_rising_edge,
-			   output logic	clk_falling_edge);
-   timeunit 1ns;
-   timeprecision 1ps;
-`ifdef SYNC_2FF
-   logic sync_0;
-   logic sync_1;
-
-   always_ff @ (posedge sys_clk) begin
-       sync_0 <= ext_clk;
-       sync_1 <= sync_0;
-   end
-
-   assign clk_rising_edge = sync_0 & ~sync_1;
-   assign clk_falling_edge = ~sync_0 & sync_1;
-`else // !`ifdef SYNC_2FF
-   logic [2:0] clk_sync;
-
-   always_ff @ (posedge sys_clk) begin
-       clk_sync <= {clk_sync[1:0], ext_clk};
-   end
-
-   assign clk_rising_edge = (clk_sync[2:1] == 2'b01);
-   assign clk_falling_edge = (clk_sync[2:1] == 2'b10);
-`endif // !`ifdef SYNC_2FF
-endmodule