
Make DMA SPI driver aware of CPU cache, fix data corruption and other SPI issues on STM32H7 #199

Merged
merged 8 commits on Dec 19, 2023
55 changes: 41 additions & 14 deletions drivers/include/drivers/SPI.h
@@ -26,6 +26,7 @@
#include "drivers/DigitalOut.h"
#include "platform/SingletonPtr.h"
#include "platform/NonCopyable.h"
#include "platform/CacheAlignedBuffer.h"

#if defined MBED_CONF_DRIVERS_SPI_COUNT_MAX && DEVICE_SPI_COUNT > MBED_CONF_DRIVERS_SPI_COUNT_MAX
#define SPI_PERIPHERALS_USED MBED_CONF_DRIVERS_SPI_COUNT_MAX
@@ -194,6 +195,11 @@ const use_gpio_ssel_t use_gpio_ssel;
* the transfer but others can execute). Here's a sample of how to send the same data as above
* using the blocking async API:</p>
*
* <p>Note that when using the asynchronous API, you must use the CacheAlignedBuffer class when declaring the
* receive buffer. This is because some processors' async SPI implementations require the received buffer to
* be at an address which is aligned to the processor cache line size. CacheAlignedBuffer takes care of this
* for you and provides functions (data(), begin(), end()) to access the underlying data in the buffer.</p>
*
* @code
* #include "mbed.h"
*
@@ -203,8 +209,8 @@ const use_gpio_ssel_t use_gpio_ssel;
* device.format(8, 0);
*
* uint8_t command[2] = {0x0A, 0x0B};
* uint8_t response[2];
* int result = device.transfer_and_wait(command, sizeof(command), response, sizeof(response));
* CacheAlignedBuffer<uint8_t, 2> response;
* int result = device.transfer_and_wait(command, sizeof(command), response, sizeof(command));
* }
* @endcode
*
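
For comparison, here is a minimal sketch of the same exchange using the non-blocking, callback-based transfer() API. The pin names and the onTransfer() handler are illustrative assumptions, not part of this change:

    #include "mbed.h"

    SPI device(SPI_MOSI, SPI_MISO, SPI_SCLK, SPI_CS, use_gpio_ssel);

    volatile bool transferDone = false;

    // Hypothetical completion handler; called in interrupt context when the
    // transfer finishes (or fails).
    void onTransfer(int event)
    {
        if (event & SPI_EVENT_COMPLETE) {
            transferDone = true;
        }
    }

    int main()
    {
        device.format(8, 0);

        uint8_t command[2] = {0x0A, 0x0B};
        CacheAlignedBuffer<uint8_t, 2> response;

        // Returns immediately; the transfer runs in the background.
        device.transfer(command, sizeof(command), response, sizeof(command),
                        callback(onTransfer), SPI_EVENT_COMPLETE);

        while (!transferDone) {
            // Other work can execute here while the transfer is in flight.
        }

        printf("Received: 0x%02X 0x%02X\n", response.data()[0], response.data()[1]);
    }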
@@ -458,8 +464,9 @@ class SPI : private NonCopyable<SPI> {
* @param tx_buffer The TX buffer with data to be transferred. If NULL is passed,
* the default %SPI value is sent.
* @param tx_length The length of TX buffer in bytes.
* @param rx_buffer The RX buffer which is used for received data. If NULL is passed,
* received data are ignored.
* @param rx_buffer The RX buffer which is used for received data. Rather than a C array, a CacheAlignedBuffer
* structure must be passed so that cache alignment can be handled for data received from DMA.
* May be nullptr if rx_length is 0.
* @param rx_length The length of RX buffer in bytes.
* @param callback The event callback function.
* @param event The logical OR of events to subscribe to. May be #SPI_EVENT_ALL, or some combination
@@ -471,16 +478,18 @@
*/
template<typename WordT>
typename std::enable_if<std::is_integral<WordT>::value, int>::type
transfer(const WordT *tx_buffer, int tx_length, WordT *rx_buffer, int rx_length, const event_callback_t &callback, int event = SPI_EVENT_COMPLETE)
transfer(const WordT *tx_buffer, int tx_length, CacheAlignedBuffer<WordT> &rx_buffer, int rx_length, const event_callback_t &callback, int event = SPI_EVENT_COMPLETE)
{
return transfer_internal(tx_buffer, tx_length, rx_buffer, rx_length, callback, event);
MBED_ASSERT(rx_length <= static_cast<int>(rx_buffer.capacity()));
return transfer_internal(tx_buffer, tx_length, rx_buffer.data(), rx_length, callback, event);
}

// Overloads of the above to support passing nullptr
template<typename WordT>
typename std::enable_if<std::is_integral<WordT>::value, int>::type
transfer(const std::nullptr_t *tx_buffer, int tx_length, WordT *rx_buffer, int rx_length, const event_callback_t &callback, int event = SPI_EVENT_COMPLETE)
transfer(const std::nullptr_t *tx_buffer, int tx_length, CacheAlignedBuffer<WordT> &rx_buffer, int rx_length, const event_callback_t &callback, int event = SPI_EVENT_COMPLETE)
{
MBED_ASSERT(rx_length <= static_cast<int>(rx_buffer.capacity()));
return transfer_internal(tx_buffer, tx_length, rx_buffer, rx_length, callback, event);
}
template<typename WordT>
@@ -502,8 +511,10 @@ class SPI : private NonCopyable<SPI> {
* Internally, the chip vendor may implement this function using either DMA or interrupts.
*
* @param tx_buffer The TX buffer with data to be transferred. May be nullptr if tx_length is 0.
* @param tx_length The length of TX buffer in bytes. If 0, no transmission is done.
* @param rx_buffer The RX buffer, which is used for received data. May be nullptr if tx_length is 0.
* @param tx_length The length of TX buffer in bytes. If 0, the default %SPI data value is sent when receiving data.
* @param rx_buffer The RX buffer which is used for received data. Rather than a C array, a CacheAlignedBuffer
* structure must be passed so that cache alignment can be handled for data received from DMA.
* May be nullptr if rx_length is 0.
* @param rx_length The length of RX buffer in bytes. If 0, no reception is done.
* @param timeout timeout value. Use #rtos::Kernel::wait_for_u32_forever to wait forever (the default).
*
@@ -515,17 +526,19 @@
*/
template<typename WordT>
typename std::enable_if<std::is_integral<WordT>::value, int>::type
transfer_and_wait(const WordT *tx_buffer, int tx_length, WordT *rx_buffer, int rx_length, rtos::Kernel::Clock::duration_u32 timeout = rtos::Kernel::wait_for_u32_forever)
transfer_and_wait(const WordT *tx_buffer, int tx_length, CacheAlignedBuffer<WordT> &rx_buffer, int rx_length, rtos::Kernel::Clock::duration_u32 timeout = rtos::Kernel::wait_for_u32_forever)
{
return transfer_and_wait_internal(tx_buffer, tx_length, rx_buffer, rx_length, timeout);
MBED_ASSERT(rx_length <= static_cast<int>(rx_buffer.capacity()));
return transfer_and_wait_internal(tx_buffer, tx_length, rx_buffer.data(), rx_length, timeout);
}

// Overloads of the above to support passing nullptr
template<typename WordT>
typename std::enable_if<std::is_integral<WordT>::value, int>::type
transfer_and_wait(const std::nullptr_t *tx_buffer, int tx_length, WordT *rx_buffer, int rx_length, rtos::Kernel::Clock::duration_u32 timeout = rtos::Kernel::wait_for_u32_forever)
transfer_and_wait(const std::nullptr_t *tx_buffer, int tx_length, CacheAlignedBuffer<WordT> &rx_buffer, int rx_length, rtos::Kernel::Clock::duration_u32 timeout = rtos::Kernel::wait_for_u32_forever)
{
return transfer_and_wait_internal(tx_buffer, tx_length, rx_buffer, rx_length, timeout);
MBED_ASSERT(rx_length <= static_cast<int>(rx_buffer.capacity()));
return transfer_and_wait_internal(tx_buffer, tx_length, rx_buffer.data(), rx_length, timeout);
}
template<typename WordT>
typename std::enable_if<std::is_integral<WordT>::value, int>::type
@@ -574,7 +587,8 @@ class SPI : private NonCopyable<SPI> {
* the default SPI value is sent
* @param tx_length The length of TX buffer in bytes.
* @param rx_buffer The RX buffer which is used for received data. If NULL is passed,
* received data are ignored.
* received data are ignored. This buffer is guaranteed to be cache aligned
* if the MCU has a cache.
* @param rx_length The length of RX buffer in bytes.
* @param callback The event callback function.
* @param event The event mask of events to modify.
@@ -599,6 +613,7 @@
* @param tx_buffer The TX buffer with data to be transferred. May be nullptr if tx_length is 0.
* @param tx_length The length of TX buffer in bytes. If 0, no transmission is done.
* @param rx_buffer The RX buffer, which is used for received data. May be nullptr if tx_length is 0.
* This buffer is guaranteed to be cache aligned if the MCU has a cache.
* @param rx_length The length of RX buffer in bytes. If 0, no reception is done.
* @param timeout timeout value. Use #rtos::Kernel::wait_for_u32_forever to wait forever (the default).
*
@@ -722,6 +737,18 @@ class SPI : private NonCopyable<SPI> {
* iff start_transfer() has been called and the chip has been selected but irq_handler_asynch()
* has NOT been called yet. */
volatile bool _transfer_in_progress = false;

// If there is a transfer in progress, this indicates whether it used DMA and therefore requires the
// Rx buffer to be invalidated in the cache when the transfer completes
bool _transfer_in_progress_uses_dma;

#if __DCACHE_PRESENT
// These variables store the location and length in bytes of the Rx buffer if an async transfer
// is in progress. They are used for invalidating the cache after the transfer completes.
void *_transfer_in_progress_rx_buffer;
size_t _transfer_in_progress_rx_len;
#endif

/* Event flags used for transfer_and_wait() */
rtos::EventFlags _transfer_and_wait_flags;
#endif // DEVICE_SPI_ASYNCH
52 changes: 49 additions & 3 deletions drivers/source/SPI.cpp
@@ -396,6 +406,16 @@ void SPI::abort_transfer()
_set_ssel(1);
}

#if __DCACHE_PRESENT
if (_transfer_in_progress_uses_dma && _transfer_in_progress_rx_len > 0) {
// If the cache is present, invalidate the Rx data so it's loaded from main RAM.
// We only want to do this if DMA actually got used for the transfer because, if interrupts
// were used instead, the cache might have the correct data and NOT the main memory.
SCB_InvalidateDCache_by_Addr(_transfer_in_progress_rx_buffer, _transfer_in_progress_rx_len);
}
#endif
_transfer_in_progress = false;

#if MBED_CONF_DRIVERS_SPI_TRANSACTION_QUEUE_LEN
dequeue_transaction();
#endif
@@ -466,8 +476,31 @@ void SPI::start_transfer(const void *tx_buffer, int tx_length, void *rx_buffer, int rx_length,

_callback = callback;
_irq.callback(&SPI::irq_handler_asynch);

#if __DCACHE_PRESENT
// On devices with a cache, we need to carefully manage the Tx and Rx buffer cache invalidation.
// We can assume that asynchronous SPI implementations might rely on DMA, and that DMA will
// not interoperate with the CPU cache. So, manual flushing/invalidation will be required.
// This page is very useful for how to do this correctly:
// https://community.st.com/t5/stm32-mcus-products/maintaining-cpu-data-cache-coherence-for-dma-buffers/td-p/95746
if (tx_length > 0) {
// For chips with a cache, we need to evict the Tx data from cache to main memory.
// This ensures that the DMA controller can see the most up-to-date copy of the data.
SCB_CleanDCache_by_Addr(const_cast<void *>(tx_buffer), tx_length);
}

// Additionally, we have to make sure that there aren't any pending changes which the cache could
// write back to the Rx buffer memory later, corrupting the DMA results.
if (rx_length > 0) {
SCB_InvalidateDCache_by_Addr(rx_buffer, rx_length);
}
_transfer_in_progress_rx_buffer = rx_buffer;
_transfer_in_progress_rx_len = rx_length;
#endif

_transfer_in_progress = true;
spi_master_transfer(&_peripheral->spi, tx_buffer, tx_length, rx_buffer, rx_length, bit_width, _irq.entry(), event, _usage);

_transfer_in_progress_uses_dma = spi_master_transfer(&_peripheral->spi, tx_buffer, tx_length, rx_buffer, rx_length, bit_width, _irq.entry(), event, _usage);
}
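
The invalidate calls above are also why the Rx buffer must be cache-line aligned and padded to a whole number of lines (which CacheAlignedBuffer arranges). A sketch of the hazard, assuming a 32-byte D-cache line as on Cortex-M7; real code should use __SCB_DCACHE_LINE_SIZE rather than a hard-coded size:

    #include <cstdint>

    // Dangerous layout: rxData is not cache-line aligned, so it can share a
    // 32-byte cache line with its neighbors.
    struct UnsafeLayout {
        volatile uint32_t counter; // may live in the same cache line as rxData
        uint8_t rxData[16];        // DMA writes here
    };

    // SCB_InvalidateDCache_by_Addr() operates on whole cache lines, so
    // invalidating rxData would also discard a dirty 'counter' in the same
    // line (losing a CPU write), and a later eviction of that line could
    // write stale data over the DMA results.

    // An aligned, padded buffer occupies exclusive cache lines, roughly:
    struct AlignedRxBuffer {
        alignas(32) uint8_t data[32];
    };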

void SPI::lock_deep_sleep()
@@ -508,7 +541,17 @@ void SPI::dequeue_transaction()
void SPI::irq_handler_asynch(void)
{
int event = spi_irq_handler_asynch(&_peripheral->spi);
if (_callback && (event & SPI_EVENT_ALL)) {
if ((event & SPI_EVENT_ALL)) {

#if __DCACHE_PRESENT
if (_transfer_in_progress_uses_dma && _transfer_in_progress_rx_len > 0) {
// If the cache is present, invalidate the Rx data so it's loaded from main RAM.
// We only want to do this if DMA actually got used for the transfer because, if interrupts
// were used instead, the cache might have the correct data and NOT the main memory.
SCB_InvalidateDCache_by_Addr(_transfer_in_progress_rx_buffer, _transfer_in_progress_rx_len);
}
#endif

// If using GPIO CS mode, unless we were asked to keep the peripheral selected, deselect it.
// If there's another transfer queued, we *do* want to deselect the peripheral now.
// It will be reselected in start_transfer() which is called by dequeue_transaction() below.
@@ -518,7 +561,10 @@
_transfer_in_progress = false;

unlock_deep_sleep();
_callback.call(event & SPI_EVENT_ALL);

if (_callback) {
_callback.call(event & SPI_EVENT_ALL);
}
}
#if MBED_CONF_DRIVERS_SPI_TRANSACTION_QUEUE_LEN
if (event & (SPI_EVENT_ALL | SPI_EVENT_INTERNAL_TRANSFER_COMPLETE)) {
18 changes: 16 additions & 2 deletions hal/include/hal/spi_api.h
@@ -25,6 +25,8 @@
#include "hal/dma_api.h"
#include "hal/buffer.h"

#include <stdbool.h>

#if DEVICE_SPI

/**
@@ -393,7 +395,11 @@ const PinMap *spi_slave_cs_pinmap(void);
* @{
*/

/** Begin the SPI transfer. Buffer pointers and lengths are specified in tx_buff and rx_buff
/** Begin the asynchronous SPI transfer. Buffer pointers and lengths are specified in tx_buff and rx_buff.
*
* If the device has a data cache, the Tx data is guaranteed to have been flushed from the cache
* to main memory already. Additionally, the Rx buffer is guaranteed to be cache aligned, and will
* be invalidated by the SPI layer after the transfer is complete.
*
* @param[in] obj The SPI object that holds the transfer information
* @param[in] tx The transmit buffer
@@ -404,8 +410,16 @@ const PinMap *spi_slave_cs_pinmap(void);
* @param[in] event The logical OR of events to be registered
* @param[in] handler SPI interrupt handler
* @param[in] hint A suggestion for how to use DMA with this transfer
*
* @return True if DMA was actually used for the transfer, false otherwise (if interrupts or another CPU-based
* method is used to do the transfer).
*
* @note On MCUs with a data cache, the return value is used to determine if a cache invalidation needs to be done
* after the transfer is complete. If this function returns true, the driver layer will cache invalidate the Rx buffer under
* the assumption that the data needs to be re-read from main memory. Be careful, because if the read was not actually
* done by DMA, and the rx data is in the CPU cache, this invalidation will corrupt it.
*/
void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint);
bool spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint);
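
To illustrate the new contract, here is a hypothetical port-side sketch; the target_* helpers are invented for illustration and do not correspond to any real target:

    #include "hal/spi_api.h"

    // Hypothetical helpers, assumed to exist for this sketch only.
    static bool target_dma_channel_available(spi_t *obj);
    static void target_start_dma_transfer(spi_t *obj, const void *tx, size_t tx_length,
                                          void *rx, size_t rx_length, uint8_t bit_width,
                                          uint32_t handler, uint32_t event);
    static void target_start_irq_transfer(spi_t *obj, const void *tx, size_t tx_length,
                                          void *rx, size_t rx_length, uint8_t bit_width,
                                          uint32_t handler, uint32_t event);

    bool spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx,
                             size_t rx_length, uint8_t bit_width, uint32_t handler,
                             uint32_t event, DMAUsage hint)
    {
        // Honor the caller's DMA hint, falling back to interrupts when no channel is free.
        bool use_dma = (hint != DMA_USAGE_NEVER) && target_dma_channel_available(obj);

        if (use_dma) {
            target_start_dma_transfer(obj, tx, tx_length, rx, rx_length, bit_width, handler, event);
        } else {
            target_start_irq_transfer(obj, tx, tx_length, rx, rx_length, bit_width, handler, event);
        }

        // The driver layer invalidates the Rx buffer after completion only when this
        // returns true, so report whether DMA was actually used.
        return use_dma;
    }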

/** The asynchronous IRQ handler
*