
Make DMA SPI driver aware of CPU cache, fix data corruption and other SPI issues on STM32H7 #199

Merged
merged 8 commits on Dec 19, 2023
55 changes: 41 additions & 14 deletions drivers/include/drivers/SPI.h
@@ -26,6 +26,7 @@
#include "drivers/DigitalOut.h"
#include "platform/SingletonPtr.h"
#include "platform/NonCopyable.h"
#include "platform/CacheAlignedBuffer.h"

#if defined MBED_CONF_DRIVERS_SPI_COUNT_MAX && DEVICE_SPI_COUNT > MBED_CONF_DRIVERS_SPI_COUNT_MAX
#define SPI_PERIPHERALS_USED MBED_CONF_DRIVERS_SPI_COUNT_MAX
@@ -194,6 +195,11 @@ const use_gpio_ssel_t use_gpio_ssel;
* the transfer but others can execute). Here's a sample of how to send the same data as above
* using the blocking async API:</p>
*
* <p>Note that when using the asynchronous API, you must use the CacheAlignedBuffer class when declaring the
* receive buffer. This is because some processors' async SPI implementations require the received buffer to
* be at an address which is aligned to the processor cache line size. CacheAlignedBuffer takes care of this
* for you and provides functions (data(), begin(), end()) to access the underlying data in the buffer.</p>
*
* @code
* #include "mbed.h"
*
@@ -203,8 +209,8 @@ const use_gpio_ssel_t use_gpio_ssel;
* device.format(8, 0);
*
* uint8_t command[2] = {0x0A, 0x0B};
* uint8_t response[2];
* int result = device.transfer_and_wait(command, sizeof(command), response, sizeof(response));
* CacheAlignedBuffer<uint8_t, 2> response;
* int result = device.transfer_and_wait(command, sizeof(command), response, sizeof(command));
* }
* @endcode
*
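
For comparison, here is a minimal sketch of the same exchange using the non-blocking, callback-based transfer() API. The pin names and the onTransfer() handler are illustrative assumptions, not part of this change:

    #include "mbed.h"

    SPI device(SPI_MOSI, SPI_MISO, SPI_SCLK, SPI_CS, use_gpio_ssel);

    volatile bool transferDone = false;

    // Hypothetical completion handler; called in interrupt context when the
    // transfer finishes (or fails).
    void onTransfer(int event)
    {
        if (event & SPI_EVENT_COMPLETE) {
            transferDone = true;
        }
    }

    int main()
    {
        device.format(8, 0);

        uint8_t command[2] = {0x0A, 0x0B};
        CacheAlignedBuffer<uint8_t, 2> response;

        // Returns immediately; the transfer runs in the background.
        device.transfer(command, sizeof(command), response, sizeof(command),
                        callback(onTransfer), SPI_EVENT_COMPLETE);

        while (!transferDone) {
            // Other work can execute here while the transfer is in flight.
        }

        printf("Received: 0x%02X 0x%02X\n", response.data()[0], response.data()[1]);
    }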
@@ -458,8 +464,9 @@ class SPI : private NonCopyable<SPI> {
* @param tx_buffer The TX buffer with data to be transferred. If NULL is passed,
* the default %SPI value is sent.
* @param tx_length The length of TX buffer in bytes.
* @param rx_buffer The RX buffer which is used for received data. If NULL is passed,
* received data are ignored.
* @param rx_buffer The RX buffer which is used for received data. Rather than a C array, a CacheAlignedBuffer
* structure must be passed so that cache alignment can be handled for data received from DMA.
* May be nullptr if rx_length is 0.
* @param rx_length The length of RX buffer in bytes.
* @param callback The event callback function.
* @param event The logical OR of events to subscribe to. May be #SPI_EVENT_ALL, or some combination
@@ -471,16 +478,18 @@
*/
template<typename WordT>
typename std::enable_if<std::is_integral<WordT>::value, int>::type
transfer(const WordT *tx_buffer, int tx_length, WordT *rx_buffer, int rx_length, const event_callback_t &callback, int event = SPI_EVENT_COMPLETE)
transfer(const WordT *tx_buffer, int tx_length, CacheAlignedBuffer<WordT> &rx_buffer, int rx_length, const event_callback_t &callback, int event = SPI_EVENT_COMPLETE)
{
return transfer_internal(tx_buffer, tx_length, rx_buffer, rx_length, callback, event);
MBED_ASSERT(rx_length <= static_cast<int>(rx_buffer.capacity()));
return transfer_internal(tx_buffer, tx_length, rx_buffer.data(), rx_length, callback, event);
}

// Overloads of the above to support passing nullptr
template<typename WordT>
typename std::enable_if<std::is_integral<WordT>::value, int>::type
transfer(const std::nullptr_t *tx_buffer, int tx_length, WordT *rx_buffer, int rx_length, const event_callback_t &callback, int event = SPI_EVENT_COMPLETE)
transfer(const std::nullptr_t *tx_buffer, int tx_length, CacheAlignedBuffer<WordT> &rx_buffer, int rx_length, const event_callback_t &callback, int event = SPI_EVENT_COMPLETE)
{
MBED_ASSERT(rx_length <= static_cast<int>(rx_buffer.capacity()));
return transfer_internal(tx_buffer, tx_length, rx_buffer, rx_length, callback, event);
}
template<typename WordT>
@@ -502,8 +511,10 @@ class SPI : private NonCopyable<SPI> {
* Internally, the chip vendor may implement this function using either DMA or interrupts.
*
* @param tx_buffer The TX buffer with data to be transferred. May be nullptr if tx_length is 0.
* @param tx_length The length of TX buffer in bytes. If 0, no transmission is done.
* @param rx_buffer The RX buffer, which is used for received data. May be nullptr if tx_length is 0.
* @param tx_length The length of TX buffer in bytes. If 0, the default %SPI data value is sent when receiving data.
* @param rx_buffer The RX buffer which is used for received data. Rather than a C array, a CacheAlignedBuffer
* structure must be passed so that cache alignment can be handled for data received from DMA.
* May be nullptr if rx_length is 0.
* @param rx_length The length of RX buffer in bytes. If 0, no reception is done.
* @param timeout timeout value. Use #rtos::Kernel::wait_for_u32_forever to wait forever (the default).
*
@@ -515,17 +526,19 @@
*/
template<typename WordT>
typename std::enable_if<std::is_integral<WordT>::value, int>::type
transfer_and_wait(const WordT *tx_buffer, int tx_length, WordT *rx_buffer, int rx_length, rtos::Kernel::Clock::duration_u32 timeout = rtos::Kernel::wait_for_u32_forever)
transfer_and_wait(const WordT *tx_buffer, int tx_length, CacheAlignedBuffer<WordT> &rx_buffer, int rx_length, rtos::Kernel::Clock::duration_u32 timeout = rtos::Kernel::wait_for_u32_forever)
{
return transfer_and_wait_internal(tx_buffer, tx_length, rx_buffer, rx_length, timeout);
MBED_ASSERT(rx_length <= static_cast<int>(rx_buffer.capacity()));
return transfer_and_wait_internal(tx_buffer, tx_length, rx_buffer.data(), rx_length, timeout);
}

// Overloads of the above to support passing nullptr
template<typename WordT>
typename std::enable_if<std::is_integral<WordT>::value, int>::type
transfer_and_wait(const std::nullptr_t *tx_buffer, int tx_length, WordT *rx_buffer, int rx_length, rtos::Kernel::Clock::duration_u32 timeout = rtos::Kernel::wait_for_u32_forever)
transfer_and_wait(const std::nullptr_t *tx_buffer, int tx_length, CacheAlignedBuffer<WordT> &rx_buffer, int rx_length, rtos::Kernel::Clock::duration_u32 timeout = rtos::Kernel::wait_for_u32_forever)
{
return transfer_and_wait_internal(tx_buffer, tx_length, rx_buffer, rx_length, timeout);
MBED_ASSERT(rx_length <= static_cast<int>(rx_buffer.capacity()));
return transfer_and_wait_internal(tx_buffer, tx_length, rx_buffer.data(), rx_length, timeout);
}
template<typename WordT>
typename std::enable_if<std::is_integral<WordT>::value, int>::type
@@ -574,7 +587,8 @@ class SPI : private NonCopyable<SPI> {
* the default SPI value is sent
* @param tx_length The length of TX buffer in bytes.
* @param rx_buffer The RX buffer which is used for received data. If NULL is passed,
* received data are ignored.
* received data are ignored. This buffer is guaranteed to be cache aligned
* if the MCU has a cache.
* @param rx_length The length of RX buffer in bytes.
* @param callback The event callback function.
* @param event The event mask of events to modify.
@@ -599,6 +613,7 @@
* @param tx_buffer The TX buffer with data to be transferred. May be nullptr if tx_length is 0.
* @param tx_length The length of TX buffer in bytes. If 0, no transmission is done.
* @param rx_buffer The RX buffer, which is used for received data. May be nullptr if tx_length is 0.
* This buffer is guaranteed to be cache aligned if the MCU has a cache.
* @param rx_length The length of RX buffer in bytes. If 0, no reception is done.
* @param timeout timeout value. Use #rtos::Kernel::wait_for_u32_forever to wait forever (the default).
*
@@ -722,6 +737,18 @@ class SPI : private NonCopyable<SPI> {
* iff start_transfer() has been called and the chip has been selected but irq_handler_asynch()
* has NOT been called yet. */
volatile bool _transfer_in_progress = false;

// If there is a transfer in progress, this indicates whether it used DMA and therefore requires the
// Rx buffer to be invalidated in the cache when the transfer completes
bool _transfer_in_progress_uses_dma;

#if __DCACHE_PRESENT
// These variables store the location and length in bytes of the Rx buffer if an async transfer
// is in progress. They are used for invalidating the cache after the transfer completes.
void *_transfer_in_progress_rx_buffer;
size_t _transfer_in_progress_rx_len;
#endif

/* Event flags used for transfer_and_wait() */
rtos::EventFlags _transfer_and_wait_flags;
#endif // DEVICE_SPI_ASYNCH
52 changes: 49 additions & 3 deletions drivers/source/SPI.cpp
@@ -396,6 +406,16 @@ void SPI::abort_transfer()
_set_ssel(1);
}

#if __DCACHE_PRESENT
if (_transfer_in_progress_uses_dma && _transfer_in_progress_rx_len > 0) {
// If the cache is present, invalidate the Rx data so it's loaded from main RAM.
// We only want to do this if DMA actually got used for the transfer because, if interrupts
// were used instead, the cache might have the correct data and NOT the main memory.
SCB_InvalidateDCache_by_Addr(_transfer_in_progress_rx_buffer, _transfer_in_progress_rx_len);
}
#endif
_transfer_in_progress = false;

#if MBED_CONF_DRIVERS_SPI_TRANSACTION_QUEUE_LEN
dequeue_transaction();
#endif
@@ -466,8 +476,31 @@ void SPI::start_transfer(const void *tx_buffer, int tx_length, void *rx_buffer, int rx_length,

_callback = callback;
_irq.callback(&SPI::irq_handler_asynch);

#if __DCACHE_PRESENT
// On devices with a cache, we need to carefully manage the Tx and Rx buffer cache invalidation.
// We can assume that asynchronous SPI implementations might rely on DMA, and that DMA will
// not interoperate with the CPU cache. So, manual flushing/invalidation will be required.
// This page is very useful for how to do this correctly:
// https://community.st.com/t5/stm32-mcus-products/maintaining-cpu-data-cache-coherence-for-dma-buffers/td-p/95746
if (tx_length > 0) {
// For chips with a cache, we need to evict the Tx data from cache to main memory.
// This ensures that the DMA controller can see the most up-to-date copy of the data.
SCB_CleanDCache_by_Addr(const_cast<void *>(tx_buffer), tx_length);
}

// Additionally, we have to make sure that there aren't any pending changes which the cache could
// write back to the Rx buffer memory later, corrupting the DMA results.
if (rx_length > 0) {
SCB_InvalidateDCache_by_Addr(rx_buffer, rx_length);
}
_transfer_in_progress_rx_buffer = rx_buffer;
_transfer_in_progress_rx_len = rx_length;
#endif

_transfer_in_progress = true;
spi_master_transfer(&_peripheral->spi, tx_buffer, tx_length, rx_buffer, rx_length, bit_width, _irq.entry(), event, _usage);

_transfer_in_progress_uses_dma = spi_master_transfer(&_peripheral->spi, tx_buffer, tx_length, rx_buffer, rx_length, bit_width, _irq.entry(), event, _usage);
}
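
The invalidate calls above are also why the Rx buffer must be cache-line aligned and padded to a whole number of lines (which CacheAlignedBuffer arranges). A sketch of the hazard, assuming a 32-byte D-cache line as on Cortex-M7; real code should use __SCB_DCACHE_LINE_SIZE rather than a hard-coded size:

    #include <cstdint>

    // Dangerous layout: rxData is not cache-line aligned, so it can share a
    // 32-byte cache line with its neighbors.
    struct UnsafeLayout {
        volatile uint32_t counter; // may live in the same cache line as rxData
        uint8_t rxData[16];        // DMA writes here
    };

    // SCB_InvalidateDCache_by_Addr() operates on whole cache lines, so
    // invalidating rxData would also discard a dirty 'counter' in the same
    // line (losing a CPU write), and a later eviction of that line could
    // write stale data over the DMA results.

    // An aligned, padded buffer occupies exclusive cache lines, roughly:
    struct AlignedRxBuffer {
        alignas(32) uint8_t data[32];
    };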

void SPI::lock_deep_sleep()
@@ -508,7 +541,17 @@ void SPI::dequeue_transaction()
void SPI::irq_handler_asynch(void)
{
int event = spi_irq_handler_asynch(&_peripheral->spi);
if (_callback && (event & SPI_EVENT_ALL)) {
if ((event & SPI_EVENT_ALL)) {

#if __DCACHE_PRESENT
if (_transfer_in_progress_uses_dma && _transfer_in_progress_rx_len > 0) {
// If the cache is present, invalidate the Rx data so it's loaded from main RAM.
// We only want to do this if DMA actually got used for the transfer because, if interrupts
// were used instead, the cache might have the correct data and NOT the main memory.
SCB_InvalidateDCache_by_Addr(_transfer_in_progress_rx_buffer, _transfer_in_progress_rx_len);
}
#endif

// If using GPIO CS mode, unless we were asked to keep the peripheral selected, deselect it.
// If there's another transfer queued, we *do* want to deselect the peripheral now.
// It will be reselected in start_transfer() which is called by dequeue_transaction() below.
@@ -518,7 +561,10 @@
_transfer_in_progress = false;

unlock_deep_sleep();
_callback.call(event & SPI_EVENT_ALL);

if (_callback) {
_callback.call(event & SPI_EVENT_ALL);
}
}
#if MBED_CONF_DRIVERS_SPI_TRANSACTION_QUEUE_LEN
if (event & (SPI_EVENT_ALL | SPI_EVENT_INTERNAL_TRANSFER_COMPLETE)) {
18 changes: 16 additions & 2 deletions hal/include/hal/spi_api.h
@@ -25,6 +25,8 @@
#include "hal/dma_api.h"
#include "hal/buffer.h"

#include <stdbool.h>

#if DEVICE_SPI

/**
@@ -393,7 +395,11 @@ const PinMap *spi_slave_cs_pinmap(void);
* @{
*/

/** Begin the SPI transfer. Buffer pointers and lengths are specified in tx_buff and rx_buff
/** Begin the asynchronous SPI transfer. Buffer pointers and lengths are specified in tx_buff and rx_buff.
*
* If the device has a data cache, the Tx data is guaranteed to have been flushed from the cache
* to main memory already. Additionally, the Rx buffer is guaranteed to be cache aligned, and will
* be invalidated by the SPI layer after the transfer is complete.
*
* @param[in] obj The SPI object that holds the transfer information
* @param[in] tx The transmit buffer
@@ -404,8 +410,16 @@ const PinMap *spi_slave_cs_pinmap(void);
* @param[in] event The logical OR of events to be registered
* @param[in] handler SPI interrupt handler
* @param[in] hint A suggestion for how to use DMA with this transfer
*
* @return True if DMA was actually used for the transfer, false otherwise (if interrupts or another CPU-based
* method is used to do the transfer).
*
* @note On MCUs with a data cache, the return value is used to determine if a cache invalidation needs to be done
* after the transfer is complete. If this function returns true, the driver layer will cache invalidate the Rx buffer under
* the assumption that the data needs to be re-read from main memory. Be careful, because if the read was not actually
* done by DMA, and the rx data is in the CPU cache, this invalidation will corrupt it.
*/
void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint);
bool spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint);
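
To illustrate the new contract, here is a hypothetical port-side sketch; the target_* helpers are invented for illustration and do not correspond to any real target:

    #include "hal/spi_api.h"

    // Hypothetical helpers, assumed to exist for this sketch only.
    static bool target_dma_channel_available(spi_t *obj);
    static void target_start_dma_transfer(spi_t *obj, const void *tx, size_t tx_length,
                                          void *rx, size_t rx_length, uint8_t bit_width,
                                          uint32_t handler, uint32_t event);
    static void target_start_irq_transfer(spi_t *obj, const void *tx, size_t tx_length,
                                          void *rx, size_t rx_length, uint8_t bit_width,
                                          uint32_t handler, uint32_t event);

    bool spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx,
                             size_t rx_length, uint8_t bit_width, uint32_t handler,
                             uint32_t event, DMAUsage hint)
    {
        // Honor the caller's DMA hint, falling back to interrupts when no channel is free.
        bool use_dma = (hint != DMA_USAGE_NEVER) && target_dma_channel_available(obj);

        if (use_dma) {
            target_start_dma_transfer(obj, tx, tx_length, rx, rx_length, bit_width, handler, event);
        } else {
            target_start_irq_transfer(obj, tx, tx_length, rx, rx_length, bit_width, handler, event);
        }

        // The driver layer invalidates the Rx buffer after completion only when this
        // returns true, so report whether DMA was actually used.
        return use_dma;
    }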

/** The asynchronous IRQ handler
*