commit 24bf28db9dc4e838b3c183f3f0cd07ab02e25967
Author: Peisong Xiao <peisong.xiao.xps@gmail.com>
Date:   Sun May 11 00:35:24 2025 -0400

    initial commit: figuring out SPI on the Tang Primer 20K, already 3 devlogs, will commit on a per-devlog/document change basis

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..6d3b606
--- /dev/null
+++ b/README.md
@@ -0,0 +1,119 @@
+# ROSE - RDMA Over SPI Engine
+A zero-copy, RDMA-inspired transport layer built over SPI, designed
+for embedded systems and affordable hardware experimentation.
+
+ROSE is part of the larger FLORA project, see [link here] for details.
+
+## What is a rose?
+> Roses are elegant. So must be the systems we grow.
+
+This is the underlying philosophy of ROSE since the day the idea
+popped into my head.
+
+## What is ROSE?
+ROSE is an open-source RDMA-inspired data transfer protocol and engine
+built over SPI, originally using Raspberry Pi devices and an FPGA
+fabric. It simulates key properties of RDMA — low-latency,
+memory-mapped, zero-copy semantics — on affordable, widely accessible
+hardware.
+
+## The goal
+To explore systems-level design, test automation and test-driven
+development cycles, and high-performance data movement through
+hardware-software co-design.
+
+## Getting started
+### Requirements
+1. Some SBCs with SPI interfaces, preferably running a DMA-enabled SPI
+   controller (Raspberry Pi's will do just fine)
+2. An FPGA with enough pins (5 per device connection)
+3. The willingness and courage to tinker with Linux and FPGAs.
+4. Some SystemVerilog knowledge (unless you're using the Tang Primer
+   20K, which is what I use)
+5. A good terminal emulator, a good shell, and a good code editor
+   (emacs preferred).
+### Deploying the masters/controllers on the SBCs
+TBD
+### Deploying the slaves/peripheral modules on the FPGA fabric
+TBD
+
+## Protocol specifications
+See `protocol.md` for details.
+
+ROSE was designed to embrace newer possibilities as development continues.
+
+## The planning
+See the plan in `plan.md`.
+
+Most of ROSE's behaviors and features have been planned *before* the
+first source file was even created.  A good plan serves both as a good
+guideline and a good reward mechanism.  You'd know early when you're
+running into trouble, and you'd know when you've made a solid step in
+realizing the project, even if it's simple shift registers to send
+back what your device sent.
+
+> Plans turn fear into focus, risk into reach, and steps into a path.
+
+When you dream big, use a plan to ground it with smaller, more
+manageable structures.  And most people like it when their dreams come
+true.
+
+## The action
+See the `devlog/` directory for a detailed record of the development
+process.
+
+Writing down what I did, how I did them, what walls I ran into, and
+how I can learn from my mistakes helped me realize this project.
+
+## Why ROSE?
+RDMA hardware is quite inaccessible to the average person, at least
+when this project started.  I want to show that with a few hundred
+dollars and some tinkering, I can build a close simulation of
+industrial-grade RDMA networks with lower bandwidth and higher
+latency, but still outperforming (in specific use cases) the
+Ethernet + TCP/IP stack we use day to day.  The best way to learn the
+ways of the industry is to try to build a miniature version in your
+garage.
+
+### The inspiration
+Before planning the project and laying out the roadmap of weeks and
+even months of development, I did a co-op at Nokia as a SR Platform
+Testing Dev.  That allowed me to learn how the networks worked, how
+data is transmitted from one end of the world to another (I mostly
+worked the network layer and data link layer).  That co-op helped me
+land an offer as a Data Center Network Engineer (intern), and also
+piqued my interest in the lower levels of how the internet runs as it
+is today.
+
+I did my own research on how to quickly get into the world of data
+centers, and came across RDMA and InfiniBand and RoCE.  It was love at
+first sight.  The idea of accessing a piece of data on a remote
+machine as if it was on the requesting machine felt like magic.  Then
+came the cold, harsh truth that my PC simply could not run anything
+like that due to hardware and software constraints.  So, I sought
+another path, picking a good protocol with DMA support and RDMA
+potential, and ended up finding SPI.  It's not as powerful as messing
+around with PCIe lanes, but more cost-effective and easier to
+implement, after all, with the right design, this can easily be
+migrated to use SERDES or other connections.
+
+The idea bloomed like a flower, extending its petals, and before I
+knew it, I had designed an entire framework with RDMA running on SPI
+connections and a test-driven development process (which, with some
+more planning, became the foundations of THORN).
+
+But designing is not enough.  Anyone could come up with an idea, some
+would even come up with brilliant ideas.  I need something concrete to
+keep me anchored to my initiative of learning how this technology
+works.  And ROSE came into being.
+
+## Special thanks
+I'd like to share my gratitude to ChatGPT and other AI-driven tools in
+helping to realize this project.  I didn't use them to write the
+actual code, but I used them to explore my ideas, to plan my path, and
+to catch anything that I overlooked in the process.  They are powerful
+in that way, they can help expand what you have in mind, they can
+offer insights into areas that you've never even heard of.  And in
+that, I'm thankful to the ever-evolving world of technology, and to the
+countless researchers whose efforts help us live in a better
+world.
diff --git a/devlog/2025-05-03-SPI-slave-implementation.md b/devlog/2025-05-03-SPI-slave-implementation.md
new file mode 100644
index 0000000..671cc9b
--- /dev/null
+++ b/devlog/2025-05-03-SPI-slave-implementation.md
@@ -0,0 +1,120 @@
+# SPI slave implementation on the Tang Primer 20K
+Date: 2025-05-03
+
+## Goals and expectations
+Today's goal will be focused on getting some simple SPI modules on the
+Tang Primer 20K running that would receive bits from a master
+(Raspberry Pi) and send them back, optionally incrementing them by
+one.
+
+### Before diving in
+- I have not yet formally learned SystemVerilog, perhaps I won't until
+  the early stages of THORN, since simulations are something of its
+  concerns.
+- Today's development will be in a learn as I go model, this is to
+  quickly get my hands dirty playing with the FPGA and related things
+  and to give myself some positive feedback after planning out such a
+  big plan and getting honestly a bit scared by the details (even
+  though I set pretty loose deadlines).
+- Hopefully, I can figure out how to run their programmer in Linux as
+  well, and hopefully they provided command-line access to their tool
+  chain or else I would have to ship the synthesized binary in the
+  repository so that I can automate flashing it.
+  
+## Results
+We didn't get to pushing the logic to the FPGA, but we did get past
+the sim using `verilator` with a `tb` generating the master signal.
+This is a great step forward as I implemented it with a buffer in
+mind, which would correspond to buffering an entire ROSE packet to
+memory.
+
+## Reflections
+1. Use elegant solutions, if something is ugly, it should not have
+   been working in the first place.
+2. Programming in SystemVerilog is very different from your normal,
+   sequential programming languages, especially with non-blocking
+   assignments.
+   - Non-blocking assignments will run at once after all the logic,
+     and all of them at once.
+   - For example: if I want to update a buffer for the result of
+     incrementing a received buffer upon the reception of the 8th bit,
+     I would have to write it as `tx_buff <= {rx_shift[6:0], mosi} +
+     1`, instead of `tx_buff <= rx_shift + 1`, `rx_shift` hasn't been
+     updated in that cycle.  This actually took me a while to figure
+     out, that I'm updating the buffer with older results if I use the
+     latter.
+3. **ALWAYS** test with longer tests, and every bit matters.
+   - For starters, my initial (incorrect) logic ran fine against a
+     single byte of data, but I didn't notice that I'm actually
+     updating the buffer at the reception of the 9th bit, which meant
+     that it quickly fell apart when I tested it against something
+     like "HELLO" and got gibberish back.  It was at that moment I
+     knew that I had incorrect timing for updating the buffer.
+   - Then there came the problem with "HELLO", I found that I got
+     results that were incremented by **2** when I ran the "fixed"
+     version.  While it was tempting to just decrement it by 1 when I
+     update `tx_shift` and when sending out the first bit of the byte
+     (since `tx_shift` isn't updated at that time, it would have to
+     draw that bit from `tx_buff`) or simply not add the 1 to
+     `tx_buff`.  But fixing the logic mattered more, and that's when I
+     noticed that they all had 0's at their second to last bit for the
+     entire string.  Yup, another sync issue.  So, I moved on to
+     testing with something with more coverage like "ABCD", which
+     covers the parity.
+4. Plan out what to do at each clock edge.  Clocks are confusing with
+   combinational logic, especially paired with SPI's specifications
+   for using both the rising and falling edges.  It took me a while to
+   realize that the rising edge happens **before** the falling one,
+   which meant that the data from the rising edge has already been
+   updated.
+5. Use `$display` to poke around the bits and bytes.  Although an
+   oscilloscope might be even better, I should probably set that up.
+   Displaying debug messages has helped me catch many errors in my
+   logic (completely avoidable, just a newbie programmer's fault).
+6. ChatGPT might not be the best tool to help.  I think it did well
+   helping me plan this project, but not in helping me with the actual
+   code (according to my philosophy, it never should even try).  It
+   tried to write a 2FF syncing module for the slave module, which is
+   completely unnecessary since we're directly syncing from the
+   master's clock.  Although that would be helpful later on with
+   inter-clock domain transactions of data.
+   - Do your own research, use ChatGPT for suggestions and analyzing
+   the error messages (thanks to `verilator` for giving me good error
+   messages and kidnapping me like a rust compiler).
+
+## Final thoughts
+SystemVerilog is confusing.  Combinational logic is confusing.
+Designing logic and tests within that framework is confusing.
+Fuddling with them befuddled me.  But it's sweet to see some positive
+feedback after all that planning.  It's a great start (even if it's
+just some simple receive-and-send-back logic), I feel like I'm
+actually starting to learn the ropes here, something that I could
+never have learned by reading online tutorials or some reference book.
+
+Even though most of today was me shooting myself in the foot with a
+poor understanding of the principles behind SV, it helped me grasp how
+the language and what it produces work.  It felt like opening myself
+to a new domain, where everything can be run all at once, something
+achievable in python or C only with running different threads and
+using mutexes to prevent data corruption.  It felt like a leap of
+faith into abstracting the logic gates but keeping the logic alive.  I
+still have faith that ROSE will succeed, along with THORN and PETAL.
+
+I still set up a testbench in ROSE, but after the completion of a
+working prototype, it will be migrated/integrated into THORN.
+
+For now, let the rose sprout on its own and let it gain the momentum
+to grow its stems and leaves...
+
+## The next step
+Dump the logic onto the FPGA, and see how it responds to the raspberry
+pi.
+
+Then, try to setup a buffer inside the FPGA's registers to hold
+packets.  Perhaps try to receive a number of ROSE packets, and then
+send them out in reverse order, with their contents modified (like
+switching the source and destination fields).
+
+Might as well enable UART dumps for debug messages, they would come
+in handy once I hand the logic to the FPGA.  This is highly optional
+for a "next step", but a must in the near-term.
diff --git a/devlog/2025-05-04-Stalling.md b/devlog/2025-05-04-Stalling.md
new file mode 100644
index 0000000..eed7ce8
--- /dev/null
+++ b/devlog/2025-05-04-Stalling.md
@@ -0,0 +1,55 @@
+# SPI slave implementation on the Tang Primer 20K
+Date: 2025-05-04
+
+## Goals and expectations
+Yesterday I completed the simulated runs for the simple SPI
+slave's logic, so I wanted to test it out on real hardware to get my
+hands dirty, and also try to start adding queues to the logic to
+actually start handling traffic and if the timing allows, play around
+with UART for a while.
+
+## Results
+Less than appealing, definitely room for improvement.  I got stuck
+because apparently the `sclk` pin should be connected to a
+timing-specific pin if I want to treat the incoming signal as a clock
+signal.  That turned today into a huge digging into the documentation
+for the FPGA for usable pins.  And usable `GCLK` pins were hard to
+find since I'm using the dock ext. board and it already repurposed
+several (most) of them to onboard peripheral ports like Ethernet.  I
+have exactly 1 pair of `GCLK` pins, for potentially eight connected
+RPi's.  That's not enough.
+
+So, I'm going with the other method: we're using an internal
+(higher-frequency) clock to capture `sclk` using 2FF synchronizers.
+This would come in handy when I start managing different queues for
+multiple connections, so today isn't completely stalling.
+
+At the very least I got some research done on how the pins on the FPGA
+are purposed, it's been a great learning experience.
+
+So, in a word, I did nothing tangible today, but paved the path for
+tomorrow.
+
+## Reflections
+1. Get things into action, real action.  Sims can only detect logical
+   errors, but throwing the code into Place and Route and letting the
+   FPGA-specific tools warn you about what you're doing wrong is
+   essential when building this close to actual hardware.
+   - If yesterday opened my mind to simultaneously running logic, then
+     today would be me tripping over because I left the safe embrace
+     of abstracting away the hardware.
+2. More knowledge isn't always bad for the project, need more
+   foresight from now on.
+	 
+## Final Thoughts
+When you port your code from one Linux machine to another, you'd
+expect it to work with some minor tweaks to the dependencies
+(unless they touch the kernel), but porting FPGA code from a sim
+bed to hardware feels harder than writing the logic itself (on an
+unfamiliar platform).
+
+The future is still looking great.
+
+## The next step
+Same as yesterday, test the logic with hardware, try setting up
+buffers, potentially tinker with UART.
diff --git a/devlog/2025-05-10-Back-on-track.md b/devlog/2025-05-10-Back-on-track.md
new file mode 100644
index 0000000..a77e318
--- /dev/null
+++ b/devlog/2025-05-10-Back-on-track.md
@@ -0,0 +1,69 @@
+# SPI slave implementation on the Tang Primer 20K
+Date: 2025-05-10
+
+## Goals and expectations
+Get back on track.
+
+I've been stalling even more due to the co-op term starting and having
+a lot to learn every day.  But they also gave me a lot of inspiration,
+especially DCTCP and some efforts from UET's congestion management
+methods, with its adaptive adjusting of transmission windows.
+
+It feels good to see some more ideas getting integrated into ROSE, but
+also daunting because I know if I don't get moving soon, this will end
+up in the trash.
+
+## Results
+Decent.  Decent progress considering the past week, I managed to solve
+the sync issue.
+
+My first approach was to just use 2 flip-flops to detect the rising
+and the falling edges, but somehow the transmission part is delayed by
+exactly 1 bit (`sclk` cycle).  It's normal for the 2 flip-flop method
+to induce a one-time delay of 1-2 cycles (resolvable by dumping the
+first byte), but not normal for it to actually be permanently delayed.
+It would've been great if I just set `miso` with the second most
+significant bit, but that's not a solution if I want to pipe entire
+bytes to a queue somewhere else.
+
+Thanks to the tutorial on https://www.fpga4fun.com/SPI2.html, I
+found the problem - I have to prepare the sending bit *before* the
+falling edge, and by the time my edge detection has reported a falling
+edge, the testbench had already sampled the `miso` line.
+
+With that out of the way, I can finally start thinking about how to
+implement a simple routing logic.
+
+## Reflections
+1. Elegance matters. The first approach was "okay" in the sense that
+   it actually did what I want, but "not okay" in the sense that it
+   fits terribly in a system.  If it's not elegant, it probably won't
+   fit.
+2. Plan small. I never thought that syncing across clock domains could
+   be such a problem, and it can even reveal problems in a working
+   design (i.e. the module I wrote on the first day that uses `sclk`
+   directly, it tries to assign the new `miso` value upon `negedge
+   sclk` and worked perfectly fine in the sim).  Plan small and hope
+   that you can do more.
+3. Read other people's code, see how other people do it, find good
+   resources.  I originally didn't try to use other people's code
+   simply because the code I found on GitHub was too generalized and
+   complex for my current understanding, and because it used Verilog,
+   which would be a pain if I tried to run it in SystemVerilog.  But
+   once I switched the keywords from "SPI SystemVerilog" to "SPI
+   FPGA", the referenced tutorial came up and it was a lifesaver.
+   Cleanest code for implementing the features I wanted.
+4. Plan well.  The first approach was caught as a somehow-running-bug
+   immediately as I try to integrate it into the larger system.  This
+   is planning helping you navigate the project and signaling pitfalls
+   very early on.
+
+## Final thoughts
+Hopefully, co-op would ease (as I digest all the incoming info from
+all the training), and we make progress.
+
+I can't imagine how metastability in systems running on higher clocks
+could be dealt with if everyone built stuff like I do ;)
+
+## The next step
+Implement FIFO modules.  Try to send a byte stream backwards.
diff --git a/docs/specifications.md b/docs/specifications.md
new file mode 100644
index 0000000..5f2a5ea
--- /dev/null
+++ b/docs/specifications.md
@@ -0,0 +1,21 @@
+# Protocol Specifications for ROSE
+
+## Protocol header
+
+ROSE will use a 20-byte header for its packets. The headers contain
+the following components:
+
+1. *CMD* (8 bits): Contains the command to the fabric or
+   interconnected devices, see *Commands* for details.
+2. *SRC* (16 bits): Contains the packet source address.
+3. *DEST* (16 bits): contains the packet destination address.
+4. *ID* (32 bits): contains a packet identifier unique to the source
+   in a large time frame.
+5. *RESERVED* (64 bits): reserved for future iterations.
+6. *CRC* (8 bits): reserved for future iterations incorporating CRC
+   checksums.
+7. *LEN* (16 bits): length of payload.
+
+## SPI Specifications
+We will be using SPI in mode 0, where data is sampled on the rising
+edge of the clock pulse and shifted out on the falling edge.
diff --git a/fabric/src/.gitignore b/fabric/src/.gitignore
new file mode 100644
index 0000000..697cf4f
--- /dev/null
+++ b/fabric/src/.gitignore
@@ -0,0 +1 @@
+obj_dir/
diff --git a/fabric/src/spi_slave.sv b/fabric/src/spi_slave.sv
new file mode 100644
index 0000000..5487611
--- /dev/null
+++ b/fabric/src/spi_slave.sv
@@ -0,0 +1,106 @@
+// NOTE: The first byte is used for syncing due to using different clock domains
+`define SYNC_2FF
+// SPI slave that echoes every received byte back incremented by one.
+// MOSI is shifted in (MSB first) on each detected SCLK rising edge; after the
+// 8th bit, byte + 1 is staged in tx_buff and shifted out on MISO during the
+// following byte.
+module spi_slave(
+                 input logic  sys_clk, // clock used to sample sclk for edge detection
+                 input logic  mosi,    // serial data in, taken on each detected rising sclk edge
+                 input logic  cs,      // high clears the shifters, low enables shifting
+                 input logic  sclk,    // SPI clock driven by the master
+                 input logic  rst,     // active-high asynchronous reset
+                 output logic miso);   // serial data out: MSB of tx_shift
+   timeunit 1ns;
+   timeprecision 1ps;
+
+   logic sclk_rising_edge;
+   logic sclk_falling_edge;
+
+   async_get_clk_edges sync (.ext_clk(sclk),
+                             .sys_clk(sys_clk),
+                             .clk_rising_edge(sclk_rising_edge),
+                             .clk_falling_edge(sclk_falling_edge));
+
+   int         bit_cnt = 0;                // bits received so far in the current byte (0..7)
+   logic [7:0] rx_shift;                   // receive shift register
+   logic [7:0] tx_shift = 8'b00101010;     // transmit shift register, initialised to 0x2A
+   logic [8:0] tx_buff = '0;               // staged reply; only bits [7:0] are transmitted
+
+   // Receive side: runs on each detected rising SCLK edge.
+   always_ff @ (posedge sclk_rising_edge or posedge rst) begin
+      if (rst) begin
+         rx_shift <= 0;
+         tx_buff <= 0;
+         bit_cnt <= 0;
+      end else if (cs) begin
+         rx_shift <= 0;
+         tx_buff <= 0;
+         bit_cnt <= 0;
+      end else begin
+         rx_shift <= {rx_shift[6:0], mosi};
+         bit_cnt <= bit_cnt + 1;
+
+         if (bit_cnt == 7) begin
+            // rx_shift still holds its pre-edge value here (non-blocking
+            // assignment), so rebuild the completed byte from it and mosi.
+            bit_cnt <= 0;
+            tx_buff <= {rx_shift[6:0], mosi} + 1;
+         end
+      end
+
+      $display("[%0d] current rx_shift: %b", $time, rx_shift);
+      $display("[%0d] current bit_cnt: %0d", $time, bit_cnt);
+      $display("[%0d] current tx_buff:	%b", $time, tx_buff);
+   end // always_ff @ (posedge sclk_rising_edge or posedge rst)
+
+   // Transmit side: runs on each detected falling SCLK edge.  rst is in the
+   // sensitivity list so tx_shift (and miso) reset even while SCLK is idle,
+   // matching the asynchronous reset of the receive side.
+   always_ff @ (posedge sclk_falling_edge or posedge rst) begin
+      if (rst) begin
+         tx_shift <= 0;
+      end else if (cs) begin
+         tx_shift <= 0;
+      end else if (bit_cnt == 0) begin
+         tx_shift <= tx_buff[7:0];          // byte boundary: load the staged reply
+      end else begin
+         tx_shift <= {tx_shift[6:0], 1'b0};
+      end
+      $display("last bit sent: %b", miso);
+      $display("[%0d] current tx_shift: %b", $time, tx_shift);
+      $display("-----------------------------------------");
+   end // always_ff @ (posedge sclk_falling_edge or posedge rst)
+
+   assign miso = tx_shift[7];
+endmodule // spi_slave
+
+// Edge detector for an external clock: ext_clk is sampled on every rising
+// sys_clk edge and the outputs flag a 0->1 or 1->0 change between samples.
+module async_get_clk_edges(
+                           input logic  ext_clk,           // clock to observe
+                           input logic  sys_clk,           // sampling clock
+                           output logic clk_rising_edge,   // 1 when consecutive samples read 0 then 1
+                           output logic clk_falling_edge); // 1 when consecutive samples read 1 then 0
+   timeunit 1ns;
+   timeprecision 1ps;
+`ifdef SYNC_2FF
+   logic [1:0] history; // two samples: [0] is the newest, [1] the one before it
+
+   always_ff @ (posedge sys_clk) begin
+      history <= {history[0], ext_clk};
+   end
+
+   assign clk_rising_edge = (history == 2'b01);
+   assign clk_falling_edge = (history == 2'b10);
+`else // !`ifdef SYNC_2FF
+   logic stage_a, stage_b, stage_c; // three samples, newest to oldest; edges use b and c
+
+   always_ff @ (posedge sys_clk) begin
+      {stage_c, stage_b, stage_a} <= {stage_b, stage_a, ext_clk};
+   end
+
+   assign clk_rising_edge = ~stage_c & stage_b;
+   assign clk_falling_edge = stage_c & ~stage_b;
+`endif // !`ifdef SYNC_2FF
+endmodule
diff --git a/fabric/src/tb.sv b/fabric/src/tb.sv
new file mode 100644
index 0000000..d105029
--- /dev/null
+++ b/fabric/src/tb.sv
@@ -0,0 +1,58 @@
+// Testbench: plays the SPI master, clocking bytes of `hello` into spi_slave and printing MISO.
+module tb;
+   timeunit 1ns;
+   timeprecision 1ps;
+
+   logic sys_clk = 0;
+
+   always #3.703 sys_clk = ~sys_clk;
+
+   logic sclk = 0;
+   logic cs = 1;
+   logic mosi = 0;
+   logic miso;
+   logic rst_n = 0;   // active-low reset, released 20 ns into the run
+   string hello = {'b1111_0000_1111_0000, "_ABCD"}; // NOTE(review): unsized literal in a concatenation - confirm tool support
+
+   // The DUT's rst is active-high, so it is driven from the inverted active-low rst_n.
+   spi_slave dut (.sys_clk(sys_clk),
+                  .mosi(mosi),
+                  .cs(cs),
+                  .sclk(sclk),
+                  .rst(~rst_n),
+                  .miso(miso));
+   initial #20 rst_n = 1;
+
+   // Sends hello[1..] MSB first at 20 ns per bit; tx_byte is overwritten from `hello` before use.
+   task spi_test (input logic [7:0] tx_byte,
+                  output logic [7:0] rx_byte);
+      begin
+         cs = 0;
+         #30;
+         for(int idx = 1; idx < hello.len(); idx = idx + 1) begin
+            tx_byte = hello[idx];
+            $display("%0d=================================================", $time);
+            for (int i = 7; i >= 0; i = i - 1) begin
+               mosi = tx_byte[i];
+               sclk = 1;
+               #10;
+               sclk = 0;
+               rx_byte[i] = miso;   // read in the same time step SCLK is driven low
+               #10;
+               $display("[%0d] received bit: %b", $time, miso);
+            end // for (int i = 7; i >= 0; i = i - 1)
+            $display("[%0d] Sent: %c  |  %b", $time, tx_byte, tx_byte);
+            $display("[%0d] Received: %c (expected %c)", $time, rx_byte, hello[idx - 1] + 8'd1);
+            $display("[%0d] Received: %b (expected %b)", $time, rx_byte, hello[idx - 1] + 8'd1);
+            $display("%0d=================================================", $time);
+         end
+         cs = 1;
+         #40;
+      end
+   endtask // spi_test
+
+   logic [7:0] tx_data = 8'd65;
+   logic [7:0] rx_data;
+
+   initial spi_test(tx_data, rx_data);
+endmodule // tb