commit 24bf28db9dc4e838b3c183f3f0cd07ab02e25967 Author: Peisong Xiao Date: Sun May 11 00:35:24 2025 -0400 initial commit: figuring out SPI on the Tang Primer 20K, already 3 devlogs, will commit on a per-devlog/document change basis diff --git a/README.md b/README.md new file mode 100644 index 0000000..6d3b606 --- /dev/null +++ b/README.md @@ -0,0 +1,119 @@ +# ROSE - RDMA Over SPI Engine +A zero-copy, RDMA-inspired transport layer built over SPI, designed +for embedded systems and affordable hardware experimentation. + +ROSE is part of the larger FLORA project, see [link here] for details. + +## What is a rose? +> Roses are elegant. So must be the systems we grow. + +This is the underlying philosophy of ROSE since the day the idea +popped into my head. + +## What is ROSE? +ROSE is an open-source RDMA-inspired data transfer protocol and engine +built over SPI, originally using Raspberry Pi devices and an FPGA +fabric. It simulates key properties of RDMA — low-latency, +memory-mapped, zero-copy semantics — on affordable, widely accessible +hardware. + +## The goal +To explore systems-level design, test automation and test-driven +development cycles, and high-performance data movement through +hardware-software co-design. + +## Getting started +### Requirements +1. Some SBCs with SPI interfaces, preferably running a DMA-enabled SPI + controller (Raspberry Pi's will do just fine) +2. An FPGA with enough pins (5 per device connection) +3. The willingness and courage to tinker with Linux and FPGAs. +4. Some SystemVerilog knowledge (unless you're using the Tang Primer + 20K, which is what I use) +5. A good terminal emulator, a good shell, and a good code editor + (emacs preferred). +### Deploying the masters/controllers on the SBCs +TBD +### Deploying the slaves/peripheral modules on the FPGA fabric +TBD + +## Protocol specifications +See `protocol.md` for details. + +ROSE was designed to embrace newer possibilities as development continues. 
+ +## The planning +See the plan in `plan.md`. + +Most of ROSE's behaviors and features have been planned *before* the +first source file was even created. A good plan serves both as a good +guideline and a good reward mechanism. You'd know early when you're +running into trouble, and you'd know when you've made a solid step in +realizing the project, even if it's simple shift registers to send +back what your device sent. + +> Plans turn fear into focus, risk into reach, and steps into a path. + +When you dream big, use a plan to ground it with smaller, more +manageable structures. And most people like it when their dreams come +true. + +## The action +See the `devlog/` directory for a detailed record of the development +process. + +Writing down what I did, how I did it, what walls I ran into, and +how I can learn from my mistakes helped me realize this project. + +## Why ROSE? +RDMA hardware is quite inaccessible to the average person, at least +when this project started. I want to show that with a few hundred +dollars and some tinkering, I can build a close simulation of +industrial-grade RDMA networks with lower bandwidth and higher +latency, but still outperforming (in specific use cases) the +Ethernet + TCP/IP stack we use day to day. The best way to learn the +ways of the industry is to try to build a miniature version in your +garage. + +### The inspiration +Before planning the project and laying out the roadmap of weeks and +even months of development, I did a co-op at Nokia as a SR Platform +Testing Dev. That allowed me to learn how the networks worked, how +data is transmitted from one end of the world to another (I mostly +worked on the network layer and data link layer). That co-op helped me +land an offer as a Data Center Network Engineer (intern), and also +piqued my interest in the lower levels of how the internet runs as it +is today. 
+ +I did my own research on how to quickly get into the world of data +centers, and came across RDMA and InfiniBand and RoCE. It was love at +first sight. The idea of accessing a piece of data on a remote +machine as if it was on the requesting machine felt like magic. Then +came the cold, harsh truth that my PC simply could not run anything +like that due to hardware and software constraints. So, I sought +another path, picking a good protocol with DMA support and RDMA +potential, and ended up finding SPI. It's not as powerful as messing +around with PCIe lanes, but more cost-effective and easier to +implement. After all, with the right design, this can easily be +migrated to use SERDES or other connections. + +The idea bloomed like a flower, extending its petals, and before I +knew it, I had designed an entire framework with RDMA running on SPI +connections and a test-driven development process (which, with some +more planning, became the foundation of THORN). + +But designing is not enough. Anyone could come up with an idea; some +would even come up with brilliant ideas. I need something concrete to +keep me anchored to my initiative of learning how this technology +works. And ROSE came into being. + +## Special thanks +I'd like to express my gratitude to ChatGPT and other AI-driven tools for +helping to realize this project. I didn't use them to write the +actual code, but I used them to explore my ideas, to plan my path, and +to catch anything that I overlooked in the process. They are powerful +in that way: they can help expand what you have in mind, and they can +offer insights into areas that you've never even heard of. And in +that, I'm thankful to the ever-evolving world of technology, and the +countless researchers and their efforts to make us live in a better +world. 
diff --git a/devlog/2025-05-03-SPI-slave-implementation.md b/devlog/2025-05-03-SPI-slave-implementation.md new file mode 100644 index 0000000..671cc9b --- /dev/null +++ b/devlog/2025-05-03-SPI-slave-implementation.md @@ -0,0 +1,120 @@ +# SPI slave implementation on the Tang Primer 20K +Date: 2025-05-03 + +## Goals and expectations +Today's goal will be focused on getting some simple SPI modules on the +Tang Primer 20K running that would receive bits from a master +(Raspberry Pi) and send them back, optionally incrementing them by +one. + +### Before diving in +- I have not yet formally learned SystemVerilog, and perhaps I won't until + the early stages of THORN, since simulations are one of its + concerns. +- Today's development will follow a learn-as-I-go model; this is to + quickly get my hands dirty playing with the FPGA and related things + and to give myself some positive feedback after planning out such a + big plan and getting honestly a bit scared by the details (even + though I set pretty loose deadlines). +- Hopefully, I can figure out how to run their programmer in Linux as + well, and hopefully they provided command-line access to their tool + chain or else I would have to ship the synthesized binary in the + repository so that I can automate flashing it. + +## Results +We didn't get to pushing the logic to the FPGA, but we did get past +the sim using `verilator` with a `tb` generating the master signal. +This is a great step forward as I implemented it with a buffer in +mind, which would correspond to buffering an entire ROSE packet to +memory. + +## Reflections +1. Use elegant solutions; if something is ugly, it should not have + been working in the first place. +2. Programming in SystemVerilog is very different from your normal, + sequential programming languages, especially with non-blocking + assignments. + - Non-blocking assignments evaluate their right-hand sides first and + then all take effect together, after the rest of the logic has run. 
+ - For example: if I want to update a buffer for the result of + incrementing a received buffer upon the reception of the 8th bit, + I would have to write it as `tx_buff <= {rx_shift[6:0], mosi} + + 1`, instead of `tx_buff <= rx_shift + 1`, because `rx_shift` hasn't been + updated yet in that cycle. It actually took me a while to figure + out that I'm updating the buffer with older results if I use the + latter. +3. **ALWAYS** test with longer tests, and every bit matters. + - For starters, my initial (incorrect) logic ran fine against a + single byte of data, but I didn't notice that I'm actually + updating the buffer at the reception of the 9th bit, which meant + that it quickly fell apart when I tested it against something + like "HELLO" and got gibberish back. It was at that moment I + knew that I had incorrect timing for updating the buffer. + - Then there came the problem with "HELLO": I found that I got + results that were incremented by **2** when I ran the "fixed" + version. It was tempting to just decrement it by 1 when I + update `tx_shift` and when sending out the first bit of the byte + (since `tx_shift` isn't updated at that time, it would have to + draw that bit from `tx_buff`) or simply not add the 1 to + `tx_buff`. But fixing the logic mattered more, and that's when I + noticed that they all had 0's at their second to last bit for the + entire string. Yup, another sync issue. So, I moved on to + testing with something with more coverage like "ABCD", which + covers the parity. +4. Plan out what to do at each clock edge. Clocks are confusing with + combinational logic, especially paired with SPI's specifications + for using both the rising and falling edges. It took me a while to + realize that the rising edge happens **before** the falling one, + which meant that the data from the rising edge has already been + updated. +5. Use `$display` to poke around the bits and bytes. Although an + oscilloscope might be even better, I should probably set that up. 
+ Displaying debug messages has helped me catch many errors in my + logic (completely avoidable, just a newbie programmer's fault). +6. ChatGPT might not be the best tool to help. I think it did well + helping me plan this project, but not in helping me with the actual + code (according to my philosophy, it never should even try). It + tried to write a 2FF syncing module for the slave module, which is + completely unnecessary since we're directly syncing from the + master's clock, although that would be helpful later on with + inter-clock domain transactions of data. + - Do your own research; use ChatGPT for suggestions and analyzing + the error messages (thanks to `verilator` for giving me good error + messages and kidnapping me like the Rust compiler). + +## Final thoughts +SystemVerilog is confusing. Combinational logic is confusing. +Designing logic and tests within that framework is confusing. +Fuddling with them befuddled me. But it's sweet to see some positive +feedback after all that planning. It's a great start (even if it's +just some simple receive-and-send-back logic), and I feel like I'm +actually starting to learn the ropes here, something that I could +never have learned by reading online tutorials or some reference book. + +Even though most of today was me shooting myself in the foot with a +poor understanding of the principles behind SV, it helped me grasp how +the language and what it produces work. It felt like opening myself +to a new domain, where everything can be run all at once, something +achievable in Python or C only with running different threads and +using mutexes to prevent data corruption. It felt like a leap of +faith into abstracting the logic gates but keeping the logic alive. I +still have faith that ROSE will succeed, along with THORN and PETAL. + +I still set up a testbench in ROSE, but after the completion of a +working prototype, it will be migrated/integrated into THORN. 
+ +For now, let the rose sprout on its own and let it gain the momentum +to grow its stems and leaves... + +## The next step +Dump the logic onto the FPGA, and see how it responds to the Raspberry +Pi. + +Then, try to set up a buffer inside the FPGA's registers to hold +packets. Perhaps try to receive a number of ROSE packets, and then +send them out in reverse order, with their contents modified (like +switching the source and destination fields). + +Might as well enable UART dumps for debug messages; they would come +in handy once I hand the logic to the FPGA. This is highly optional +for a "next step", but a must in the near-term. diff --git a/devlog/2025-05-04-Stalling.md b/devlog/2025-05-04-Stalling.md new file mode 100644 index 0000000..eed7ce8 --- /dev/null +++ b/devlog/2025-05-04-Stalling.md @@ -0,0 +1,55 @@ +# SPI slave implementation on the Tang Primer 20K +Date: 2025-05-04 + +## Goals and expectations +Since I completed the simulated runs for the simple SPI +slave's logic yesterday, I wanted to test it out on real hardware to get my +hands dirty, and also try to start adding queues to the logic to +actually start handling traffic and, if the timing allows, play around +with UART for a while. + +## Results +Less than appealing, definitely room for improvement. I got stuck +because apparently the `sclk` pin should be connected to a +timing-specific pin if I want to treat the incoming signal as a clock +signal. That turned today into a long dig through the documentation +for the FPGA for usable pins. And usable `GCLK` pins were hard to +find since I'm using the dock ext. board and it already repurposed +several (most) of them to onboard peripheral ports like Ethernet. I +have exactly 1 pair of `GCLK` pins, for potentially eight connected +RPis. That's not enough. + +So, I'm going with the other method: we're using an internal +(higher-frequency) clock to capture `sclk` using 2FF synchronizers. 
+This would come in handy when I start managing different queues for +multiple connections, so today isn't completely stalling. + +At the very least I got some research done on how the pins on the FPGA +are purposed; it's been a great learning experience. + +So, in a word, I did nothing tangible today, but paved the path for +tomorrow. + +## Reflections +1. Get things into action, real action. Sims can only detect logical + errors, but throwing the code into Place and Route and letting the + FPGA-specific tools warn you about what you're doing wrong is + essential when building this close to actual hardware. + - If yesterday opened my mind to simultaneously running logic, then + today would be me tripping over because I left the safe embrace + of abstracting away the hardware. +2. More knowledge isn't always bad for the project; I need more + foresight from now on. + +## Final Thoughts +When you port your code from one Linux machine to another, you'd +expect it to work with some minor tweaks to the dependencies +(unless they touch the kernel), but porting FPGA code from a sim +bed to hardware feels harder than writing the logic itself (on an +unfamiliar platform). + +The future is still looking great. + +## The next step +Same as yesterday: test the logic with hardware, try setting up +buffers, potentially tinker with UART. diff --git a/devlog/2025-05-10-Back-on-track.md b/devlog/2025-05-10-Back-on-track.md new file mode 100644 index 0000000..a77e318 --- /dev/null +++ b/devlog/2025-05-10-Back-on-track.md @@ -0,0 +1,69 @@ +# SPI slave implementation on the Tang Primer 20K +Date: 2025-05-10 + +## Goals and expectations +Get back on track. + +I've been stalling even more due to the co-op term starting and having +a lot to learn every day. But they also gave me a lot of inspiration, +especially DCTCP and some efforts from UET's congestion management +methods, with its adaptive adjusting of transmission windows. 
+ +It feels good to see some more ideas getting integrated into ROSE, but +also daunting because I know if I don't get moving soon, this will end +up in the trash. + +## Results +Decent. Decent progress considering the past week, I managed to solve +the sync issue. + +My first approach was to just use 2 flip-flops to detect the rising +and the falling edges, but somehow the transmission part is delayed by +exactly 1 bit (`sclk` cycle). It's normal for the 2 flip-flop method +to induce a one-time delay of 1-2 cycles (resolvable by dumping the +first byte), but not normal for it to actually be permanently delayed. +It would've been great if I just set `miso` with the second most +significant bit, but that's not a solution if I want to pipe entire +bytes to a queue somewhere else. + +Thanks to the tutorial on https://www.fpga4fun.com/SPI2.html, I +found the problem - I have to prepare the sending bit *before* the +falling edge, and by the time my edge detection has reported a falling +edge, the testbench had already sampled the `miso` line. + +With that out of the way, I can finally start thinking about how to +implement a simple routing logic. + +## Reflections +1. Elegance matters. The first approach was "okay" in the sense that + it actually did what I want, but "not okay" in the sense that it + fits terribly in a system. If it's not elegant, it probably won't + fit. +2. Plan small. I never thought that syncing across clock domains could + be such a problem, and it can even reveal problems in a working + design (i.e. the module I wrote on the first day that uses `sclk` + directly, it tries to assign the new `miso` value upon `negedge + sclk` and worked perfectly fine in the sim). Plan small and hope + that you can do more. +3. Read other people's code, see how other people do it, find good + resources. 
I originally didn't try to use other people's code + simply because the code I found on GitHub was too generalized and + complex for my current understanding, and because it used Verilog, + which would be a pain if I tried to run it in SystemVerilog. But + once I switched the keywords from "SPI SystemVerilog" to "SPI + FPGA", the referenced tutorial came up and it was a lifesaver. + Cleanest code for implementing the features I wanted. +4. Plan well. The first approach was caught as a somehow-running-bug + as soon as I tried to integrate it into the larger system. This + is planning helping you navigate the project and signaling pitfalls + very early on. + +## Final thoughts +Hopefully, co-op will ease (as I digest all the incoming info from +all the training), and we make progress. + +I can't imagine how systems running on higher clocks could be built, +given metastability, if everyone built stuff like I do ;) + +## The next step +Implement FIFO modules. Try to send a byte stream backwards. diff --git a/docs/specifications.md b/docs/specifications.md new file mode 100644 index 0000000..5f2a5ea --- /dev/null +++ b/docs/specifications.md @@ -0,0 +1,21 @@ +# Protocol Specifications for ROSE + +## Protocol header + +ROSE will use a 20-byte header for its packets. The headers contain +the following components: + +1. *CMD* (8 bits): contains the command to the fabric or + interconnected devices, see *Commands* for details. +2. *SRC* (16 bits): contains the packet source address. +3. *DEST* (16 bits): contains the packet destination address. +4. *ID* (32 bits): contains a packet identifier unique to the source + in a large time frame. +5. *RESERVED* (64 bits): reserved for future iterations. +6. *CRC* (8 bits): reserved for future iterations incorporating CRC + checksums. +7. *LEN* (16 bits): length of payload. + +## SPI Specifications +We will be using SPI in mode 0, where data is sampled on the rising +edge of the clock pulse and shifted out on the falling edge. 
diff --git a/fabric/src/.gitignore b/fabric/src/.gitignore
new file mode 100644
index 0000000..697cf4f
--- /dev/null
+++ b/fabric/src/.gitignore
@@ -0,0 +1 @@
+obj_dir/
diff --git a/fabric/src/spi_slave.sv b/fabric/src/spi_slave.sv
new file mode 100644
index 0000000..5487611
--- /dev/null
+++ b/fabric/src/spi_slave.sv
@@ -0,0 +1,136 @@
+// NOTE: The first byte is used for syncing due to using different clock domains
+`define SYNC_2FF
+
+// SPI mode-0 slave that echoes each received byte, incremented by one, while
+// the following byte is being transferred.
+//
+// All registers are clocked by sys_clk. The sclk edge pulses coming out of
+// async_get_clk_edges are used as clock enables only, never as clocks, so
+// every register stays in the sys_clk domain.
+// NOTE(review): this presumably requires sys_clk to be several times faster
+// than sclk -- confirm the supported ratio.
+//
+// Ports:
+//   sys_clk - clock used to oversample sclk and to clock all registers
+//   mosi    - serial data from the master, shifted in MSB first
+//   cs      - chip select, active low; while high the transfer state is cleared
+//   sclk    - SPI clock driven by the master
+//   rst     - asynchronous reset, active high
+//   miso    - serial data to the master (MSB of the transmit shift register)
+module spi_slave(
+    input  logic sys_clk,
+    input  logic mosi,
+    input  logic cs,
+    input  logic sclk,
+    input  logic rst,
+    output logic miso);
+    timeunit 1ns;
+    timeprecision 1ps;
+
+    logic sclk_rising_edge;
+    logic sclk_falling_edge;
+
+    async_get_clk_edges sync (.ext_clk(sclk),
+                              .sys_clk(sys_clk),
+                              .clk_rising_edge(sclk_rising_edge),
+                              .clk_falling_edge(sclk_falling_edge));
+
+    logic [2:0] bit_cnt = '0;            // bits received so far in this byte
+    logic [7:0] rx_shift = '0;           // receive shift register
+    logic [7:0] tx_shift = 8'b00101010;  // transmit shift register
+    logic [7:0] tx_buff = '0;            // byte queued for the next transfer
+    logic [7:0] rx_next;                 // rx_shift with the incoming bit appended
+
+    // rx_shift only changes at the end of the cycle, so the completed byte
+    // has to be formed from the bit that is arriving right now.
+    assign rx_next = {rx_shift[6:0], mosi};
+
+    // Receive path: shift mosi in on every rising edge of sclk.
+    // NOTE(review): mosi is sampled without a synchronizer of its own; this
+    // assumes it is stable when the edge pulse is seen -- confirm on hardware.
+    always_ff @ (posedge sys_clk or posedge rst) begin
+        if (rst) begin
+            rx_shift <= '0;
+            tx_buff <= '0;
+            bit_cnt <= '0;
+        end else if (cs) begin
+            rx_shift <= '0;
+            tx_buff <= '0;
+            bit_cnt <= '0;
+        end else if (sclk_rising_edge) begin
+            rx_shift <= rx_next;
+            bit_cnt <= bit_cnt + 3'd1; // 3 bits wide: wraps from 7 back to 0
+
+            // The 8th bit completes the byte: queue the incremented reply.
+            if (bit_cnt == 3'd7) begin
+                tx_buff <= rx_next + 8'd1;
+            end
+
+            $display("[%0d] current rx_shift: %b", $time, rx_shift);
+            $display("[%0d] current bit_cnt: %0d", $time, bit_cnt);
+            $display("[%0d] current tx_buff: %b", $time, tx_buff);
+        end
+    end
+
+    // Transmit path: move to the next outgoing bit on every falling edge of
+    // sclk, loading the queued byte once a full byte has been received.
+    always_ff @ (posedge sys_clk or posedge rst) begin
+        if (rst) begin
+            tx_shift <= '0;
+        end else if (cs) begin
+            tx_shift <= '0;
+        end else if (sclk_falling_edge) begin
+            if (bit_cnt == 3'd0) begin
+                tx_shift <= tx_buff;
+            end else begin
+                tx_shift <= {tx_shift[6:0], 1'b0};
+            end
+
+            $display("last bit sent: %b", miso);
+            $display("[%0d] current tx_shift: %b", $time, tx_shift);
+            $display("-----------------------------------------");
+        end
+    end
+
+    assign miso = tx_shift[7];
+
+endmodule // spi_slave
+
+// Oversamples ext_clk with sys_clk and reports its edges as pulses that stay
+// high for one sys_clk cycle.
+//
+// With SYNC_2FF defined, ext_clk is registered twice and an edge is flagged
+// while the two stages differ. Otherwise three stages are kept and the edge
+// is taken from the two oldest ones, one sys_clk cycle later.
+// NOTE(review): the SYNC_2FF variant feeds the first-stage flop straight into
+// the edge logic, so a metastable sample is not filtered -- confirm this is
+// acceptable on the target hardware.
+module async_get_clk_edges(
+    input  logic ext_clk,
+    input  logic sys_clk,
+    output logic clk_rising_edge,
+    output logic clk_falling_edge);
+    timeunit 1ns;
+    timeprecision 1ps;
+`ifdef SYNC_2FF
+    logic sync_0;
+    logic sync_1;
+
+    always_ff @ (posedge sys_clk) begin
+        sync_0 <= ext_clk;
+        sync_1 <= sync_0;
+    end
+
+    assign clk_rising_edge = sync_0 & ~sync_1;
+    assign clk_falling_edge = ~sync_0 & sync_1;
+`else // !`ifdef SYNC_2FF
+    logic [2:0] clk_sync;
+
+    always_ff @ (posedge sys_clk) begin
+        clk_sync <= {clk_sync[1:0], ext_clk};
+    end
+
+    assign clk_rising_edge = (clk_sync[2:1] == 2'b01);
+    assign clk_falling_edge = (clk_sync[2:1] == 2'b10);
+`endif // !`ifdef SYNC_2FF
+endmodule
diff --git a/fabric/src/tb.sv b/fabric/src/tb.sv
new file mode 100644
index 0000000..d105029
--- /dev/null
+++ b/fabric/src/tb.sv
@@ -0,0 +1,80 @@
+// Testbench for spi_slave. It plays the SPI master: it clocks the bytes of
+// `hello` out on mosi and checks that each one comes back incremented by one
+// during the following byte.
+module tb;
+    timeunit 1ns;
+    timeprecision 1ps;
+
+    logic sys_clk = 0;
+
+    always #3.703 sys_clk = ~sys_clk;
+
+    logic sclk = 0;
+    logic cs = 1;
+    logic mosi = 0;
+    logic miso;
+    logic rst_n = 0;
+    // The leading literal is sized because unsized constants are not allowed
+    // in concatenations.
+    string hello = {16'b1111_0000_1111_0000, "_ABCD"};
+    int mismatches = 0;
+
+    spi_slave dut (.sys_clk(sys_clk),
+                   .mosi(mosi),
+                   .cs(cs),
+                   .sclk(sclk),
+                   .rst(~rst_n), // the DUT reset is active high
+                   .miso(miso));
+    initial begin
+        #20 rst_n = 1;
+    end
+
+    // Sends hello[1] onwards over SPI, MSB first, and compares each byte read
+    // back on miso with the previous byte of hello plus one.
+    //   tx_byte - overwritten with hello[idx] before use, so the value passed
+    //             in has no effect
+    //   rx_byte - last byte read back from miso
+    task spi_test (input logic [7:0] tx_byte,
+                   output logic [7:0] rx_byte);
+        logic [7:0] expected;
+        begin
+            cs = 0;
+            #30;
+            for (int idx = 1; idx < hello.len(); idx = idx + 1) begin
+                tx_byte = hello[idx];
+                expected = hello[idx - 1] + 8'd1;
+                $display("%0d=================================================", $time);
+                for (int i = 7; i >= 0; i = i - 1) begin
+                    mosi = tx_byte[i];
+                    sclk = 1;
+                    #10;
+                    sclk = 0;
+                    rx_byte[i] = miso;
+                    #10;
+                    $display("[%0d] received bit: %b", $time, miso);
+                end // for (int i = 7; i >= 0; i = i - 1)
+                $display("[%0d] Sent: %c | %b", $time, tx_byte, tx_byte);
+                $display("[%0d] Received: %c (expected %c)", $time, rx_byte, expected);
+                $display("[%0d] Received: %b (expected %b)", $time, rx_byte, expected);
+                // The first reply answers a byte that was never sent, so it
+                // is not checked.
+                if (idx > 1 && rx_byte !== expected) begin
+                    mismatches = mismatches + 1;
+                    $display("[%0d] MISMATCH at byte %0d", $time, idx);
+                end
+                $display("%0d=================================================", $time);
+            end
+            cs = 1;
+            #40;
+        end
+    endtask // spi_test
+
+    logic [7:0] tx_data = 8'd65;
+    logic [7:0] rx_data;
+
+    initial begin
+        spi_test(tx_data, rx_data);
+        $display("[%0d] Test finished with %0d mismatch(es)", $time, mismatches);
+        $finish;
+    end
+endmodule // tb