Handle cache alignment in DMA SPI driver
multiplemonomials committed Dec 4, 2023
1 parent d9676cc commit ba3e4e0
Showing 27 changed files with 414 additions and 98 deletions.
50 changes: 34 additions & 16 deletions drivers/include/drivers/SPI.h
@@ -26,6 +26,7 @@
#include "drivers/DigitalOut.h"
#include "platform/SingletonPtr.h"
#include "platform/NonCopyable.h"
#include "platform/CacheAlignedBuffer.h"

#if defined MBED_CONF_DRIVERS_SPI_COUNT_MAX && DEVICE_SPI_COUNT > MBED_CONF_DRIVERS_SPI_COUNT_MAX
#define SPI_PERIPHERALS_USED MBED_CONF_DRIVERS_SPI_COUNT_MAX
@@ -458,8 +459,9 @@ class SPI : private NonCopyable<SPI> {
* @param tx_buffer The TX buffer with data to be transferred. If NULL is passed,
* the default %SPI value is sent.
* @param tx_length The length of TX buffer in bytes.
* @param rx_buffer The RX buffer which is used for received data. If NULL is passed,
* received data are ignored.
* @param rx_buffer The RX buffer which is used for received data. Rather than a C array, a CacheAlignedBuffer
* structure must be passed so that cache alignment can be handled for data received from DMA.
* May be nullptr if rx_length is 0.
* @param rx_length The length of RX buffer in bytes.
* @param callback The event callback function.
* @param event The logical OR of events to subscribe to. May be #SPI_EVENT_ALL, or some combination
@@ -469,17 +471,17 @@
* @retval 0 If the transfer has started.
* @retval -1 if the transfer could not be enqueued (increase drivers.spi_transaction_queue_len option)
*/
template<typename WordT>
template<typename WordT, size_t RxBufferLen>
typename std::enable_if<std::is_integral<WordT>::value, int>::type
transfer(const WordT *tx_buffer, int tx_length, WordT *rx_buffer, int rx_length, const event_callback_t &callback, int event = SPI_EVENT_COMPLETE)
transfer(const WordT *tx_buffer, int tx_length, CacheAlignedBuffer<WordT, RxBufferLen> & rx_buffer, int rx_length, const event_callback_t &callback, int event = SPI_EVENT_COMPLETE)
{
return transfer_internal(tx_buffer, tx_length, rx_buffer, rx_length, callback, event);
return transfer_internal(tx_buffer, tx_length, rx_buffer.data(), rx_length, callback, event);
}

// Overloads of the above to support passing nullptr
template<typename WordT>
template<typename WordT, size_t RxBufferLen>
typename std::enable_if<std::is_integral<WordT>::value, int>::type
transfer(const std::nullptr_t *tx_buffer, int tx_length, WordT *rx_buffer, int rx_length, const event_callback_t &callback, int event = SPI_EVENT_COMPLETE)
transfer(const std::nullptr_t *tx_buffer, int tx_length, CacheAlignedBuffer<WordT, RxBufferLen> & rx_buffer, int rx_length, const event_callback_t &callback, int event = SPI_EVENT_COMPLETE)
{
return transfer_internal(tx_buffer, tx_length, rx_buffer, rx_length, callback, event);
}
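
For context, a call to the updated asynchronous transfer() overload might look like the sketch below. This example is not part of the commit; the pin names, buffer size, and the assumption that CacheAlignedBuffer is default-constructible and exposes data() (as used in the overloads above) are illustrative only.

#include "mbed.h"

SPI spi(PA_7, PA_6, PA_5);                          // MOSI, MISO, SCLK -- placeholder pins
const uint8_t tx_data[8] = {1, 2, 3, 4, 5, 6, 7, 8};
CacheAlignedBuffer<uint8_t, 8> rx_data;             // replaces a plain uint8_t rx_data[8]

void transfer_done(int event)
{
    if (event & SPI_EVENT_COMPLETE) {
        // rx_data.data() now holds the received bytes; the driver has already invalidated
        // this region from the data cache if DMA was used for the transfer.
    }
}

int main()
{
    spi.transfer(tx_data, sizeof(tx_data), rx_data, sizeof(tx_data),
                 callback(transfer_done), SPI_EVENT_COMPLETE);
    // In a real application something must keep the program alive until the callback runs
    // (e.g. an EventFlags wait); omitted here for brevity.
}
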
@@ -502,8 +504,10 @@ class SPI : private NonCopyable<SPI> {
* Internally, the chip vendor may implement this function using either DMA or interrupts.
*
* @param tx_buffer The TX buffer with data to be transferred. May be nullptr if tx_length is 0.
* @param tx_length The length of TX buffer in bytes. If 0, no transmission is done.
* @param rx_buffer The RX buffer, which is used for received data. May be nullptr if tx_length is 0.
* @param tx_length The length of TX buffer in bytes. If 0, the default %SPI data value is sent when receiving data.
* @param rx_buffer The RX buffer which is used for received data. Rather than a C array, a CacheAlignedBuffer
* structure must be passed so that cache alignment can be handled for data received from DMA.
* May be nullptr if rx_length is 0.
* @param rx_length The length of RX buffer in bytes. If 0, no reception is done.
* @param timeout timeout value. Use #rtos::Kernel::wait_for_u32_forever to wait forever (the default).
*
@@ -513,19 +517,19 @@
* @retval 2 on other error
* @retval 0 on success
*/
template<typename WordT>
template<typename WordT, size_t RxBufferLen>
typename std::enable_if<std::is_integral<WordT>::value, int>::type
transfer_and_wait(const WordT *tx_buffer, int tx_length, WordT *rx_buffer, int rx_length, rtos::Kernel::Clock::duration_u32 timeout = rtos::Kernel::wait_for_u32_forever)
transfer_and_wait(const WordT *tx_buffer, int tx_length, CacheAlignedBuffer<WordT, RxBufferLen> & rx_buffer, int rx_length, rtos::Kernel::Clock::duration_u32 timeout = rtos::Kernel::wait_for_u32_forever)
{
return transfer_and_wait_internal(tx_buffer, tx_length, rx_buffer, rx_length, timeout);
return transfer_and_wait_internal(tx_buffer, tx_length, rx_buffer.data(), rx_length, timeout);
}

// Overloads of the above to support passing nullptr
template<typename WordT>
template<typename WordT, size_t RxBufferLen>
typename std::enable_if<std::is_integral<WordT>::value, int>::type
transfer_and_wait(const std::nullptr_t *tx_buffer, int tx_length, WordT *rx_buffer, int rx_length, rtos::Kernel::Clock::duration_u32 timeout = rtos::Kernel::wait_for_u32_forever)
transfer_and_wait(const std::nullptr_t *tx_buffer, int tx_length, CacheAlignedBuffer<WordT, RxBufferLen> & rx_buffer, int rx_length, rtos::Kernel::Clock::duration_u32 timeout = rtos::Kernel::wait_for_u32_forever)
{
return transfer_and_wait_internal(tx_buffer, tx_length, rx_buffer, rx_length, timeout);
return transfer_and_wait_internal(tx_buffer, tx_length, rx_buffer.data(), rx_length, timeout);
}
template<typename WordT>
typename std::enable_if<std::is_integral<WordT>::value, int>::type
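
A blocking equivalent using transfer_and_wait(), continuing the illustrative sketch above (the command byte and response layout are hypothetical, not taken from this commit):

// Hypothetical blocking read: clock out one command byte plus padding, read back three bytes.
const uint8_t cmd[4] = {0x9F, 0x00, 0x00, 0x00};
CacheAlignedBuffer<uint8_t, 4> response;

int err = spi.transfer_and_wait(cmd, sizeof(cmd), response, sizeof(cmd));
if (err == 0) {
    printf("ID: %02x %02x %02x\n", response.data()[1], response.data()[2], response.data()[3]);
}
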
@@ -574,7 +578,8 @@ class SPI : private NonCopyable<SPI> {
* the default SPI value is sent
* @param tx_length The length of TX buffer in bytes.
* @param rx_buffer The RX buffer which is used for received data. If NULL is passed,
* received data are ignored.
* received data are ignored. This buffer is guaranteed to be cache aligned
* if the MCU has a cache.
* @param rx_length The length of RX buffer in bytes.
* @param callback The event callback function.
* @param event The event mask of events to modify.
@@ -599,6 +604,7 @@
* @param tx_buffer The TX buffer with data to be transferred. May be nullptr if tx_length is 0.
* @param tx_length The length of TX buffer in bytes. If 0, no transmission is done.
* @param rx_buffer The RX buffer, which is used for received data. May be nullptr if rx_length is 0.
* This buffer is guaranteed to be cache aligned if the MCU has a cache.
* @param rx_length The length of RX buffer in bytes. If 0, no reception is done.
* @param timeout timeout value. Use #rtos::Kernel::wait_for_u32_forever to wait forever (the default).
*
@@ -722,6 +728,18 @@ class SPI : private NonCopyable<SPI> {
* iff start_transfer() has been called and the chip has been selected but irq_handler_asynch()
* has NOT been called yet. */
volatile bool _transfer_in_progress = false;

// If there is a transfer in progress, this indicates whether it used DMA and therefore requires a cache
// flush at the end
bool _transfer_in_progress_uses_dma;

#if __DCACHE_PRESENT
// These variables store the location and length in bytes of the Rx buffer if an async transfer
// is in progress. They are used for invalidating the cache after the transfer completes.
void * _transfer_in_progress_rx_buffer;
size_t _transfer_in_progress_rx_len;
#endif

/* Event flags used for transfer_and_wait() */
rtos::EventFlags _transfer_and_wait_flags;
#endif // DEVICE_SPI_ASYNCH
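
Why the public API now insists on a CacheAlignedBuffer rather than a raw array: the CMSIS cache maintenance calls used in the SPI.cpp changes below operate on whole cache lines, so if the Rx buffer merely shared a cache line with unrelated variables, invalidating it would also throw away any dirty data those neighbours had in the cache. A buffer that starts on a cache-line boundary and is padded to a whole number of cache lines avoids this. The CacheAlignedBuffer header added by this commit is not shown in this diff; the fragment below is only a sketch of the guarantee such a type has to provide (__SCB_DCACHE_LINE_SIZE is the CMSIS data-cache line size, 32 bytes on Cortex-M7).

#include <cstddef>
#include "cmsis.h"   // assumed to provide __SCB_DCACHE_LINE_SIZE on cached targets

// Sketch only -- NOT the CacheAlignedBuffer implementation from this commit.
template <typename DataT, size_t BufferSize>
struct AlignedRxBufferSketch {
    // Start on a cache-line boundary and round the storage up to whole cache lines, so that
    // cleaning/invalidating this region can never touch bystander data.
    alignas(__SCB_DCACHE_LINE_SIZE) DataT _buffer[
        ((BufferSize * sizeof(DataT) + __SCB_DCACHE_LINE_SIZE - 1) / __SCB_DCACHE_LINE_SIZE)
        * (__SCB_DCACHE_LINE_SIZE / sizeof(DataT))];

    DataT *data() { return _buffer; }
};
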
57 changes: 54 additions & 3 deletions drivers/source/SPI.cpp
@@ -396,6 +396,17 @@ void SPI::abort_transfer()
_set_ssel(1);
}

#if __DCACHE_PRESENT
if(_transfer_in_progress_uses_dma)
{
// If the cache is present, invalidate the Rx data so it's loaded from main RAM.
// We only want to do this if DMA actually got used for the transfer because, if interrupts
// were used instead, the cache might have the correct data and NOT the main memory.
SCB_InvalidateDCache_by_Addr(_transfer_in_progress_rx_buffer, _transfer_in_progress_rx_len);
}
#endif
_transfer_in_progress = false;

#if MBED_CONF_DRIVERS_SPI_TRANSACTION_QUEUE_LEN
dequeue_transaction();
#endif
@@ -466,8 +477,33 @@ void SPI::start_transfer(const void *tx_buffer, int tx_length, void *rx_buffer,

_callback = callback;
_irq.callback(&SPI::irq_handler_asynch);

#if __DCACHE_PRESENT
// On devices with a cache, we need to carefully manage the Tx and Rx buffer cache invalidation.
// We can assume that asynchronous SPI implementations might rely on DMA, and that DMA will
// not interoperate with the CPU cache. So, manual flushing/invalidation will be required.
// This page is very useful for how to do this correctly:
// https://community.st.com/t5/stm32-mcus-products/maintaining-cpu-data-cache-coherence-for-dma-buffers/td-p/95746
if(tx_length > 0)
{
// For chips with a cache, we need to evict the Tx data from cache to main memory.
// This ensures that the DMA controller can see the most up-to-date copy of the data.
SCB_CleanDCache_by_Addr(const_cast<void*>(tx_buffer), tx_length);
}

// Additionally, we have to make sure that there aren't any pending changes which could be written back
// to the Rx buffer memory by the cache at a later date, corrupting the DMA results
if(rx_length > 0)
{
SCB_CleanInvalidateDCache_by_Addr(rx_buffer, rx_length);
}
_transfer_in_progress_rx_buffer = rx_buffer;
_transfer_in_progress_rx_len = rx_length;
#endif

_transfer_in_progress = true;
spi_master_transfer(&_peripheral->spi, tx_buffer, tx_length, rx_buffer, rx_length, bit_width, _irq.entry(), event, _usage);

_transfer_in_progress_uses_dma = spi_master_transfer(&_peripheral->spi, tx_buffer, tx_length, rx_buffer, rx_length, bit_width, _irq.entry(), event, _usage);
}

void SPI::lock_deep_sleep()
@@ -508,7 +544,18 @@ void SPI::dequeue_transaction()
void SPI::irq_handler_asynch(void)
{
int event = spi_irq_handler_asynch(&_peripheral->spi);
if (_callback && (event & SPI_EVENT_ALL)) {
if ((event & SPI_EVENT_ALL)) {

#if __DCACHE_PRESENT
if(_transfer_in_progress_uses_dma)
{
// If the cache is present, invalidate the Rx data so it's loaded from main RAM.
// We only want to do this if DMA actually got used for the transfer because, if interrupts
// were used instead, the cache might have the correct data and NOT the main memory.
SCB_InvalidateDCache_by_Addr(_transfer_in_progress_rx_buffer, _transfer_in_progress_rx_len);
}
#endif

// If using GPIO CS mode, unless we were asked to keep the peripheral selected, deselect it.
// If there's another transfer queued, we *do* want to deselect the peripheral now.
// It will be reselected in start_transfer() which is called by dequeue_transaction() below.
@@ -518,7 +565,11 @@ void SPI::irq_handler_asynch(void)
_transfer_in_progress = false;

unlock_deep_sleep();
_callback.call(event & SPI_EVENT_ALL);

if(_callback)
{
_callback.call(event & SPI_EVENT_ALL);
}
}
#if MBED_CONF_DRIVERS_SPI_TRANSACTION_QUEUE_LEN
if (event & (SPI_EVENT_ALL | SPI_EVENT_INTERNAL_TRANSFER_COMPLETE)) {
18 changes: 16 additions & 2 deletions hal/include/hal/spi_api.h
@@ -25,6 +25,8 @@
#include "hal/dma_api.h"
#include "hal/buffer.h"

#include <stdbool.h>

#if DEVICE_SPI

/**
@@ -393,7 +395,11 @@ const PinMap *spi_slave_cs_pinmap(void);
* @{
*/

/** Begin the SPI transfer. Buffer pointers and lengths are specified in tx_buff and rx_buff
/** Begin the asynchronous SPI transfer. Buffer pointers and lengths are specified in tx_buff and rx_buff.
*
* If the device has a data cache, the Tx data is guaranteed to have been flushed from the cache
* to main memory already. Additionally, the Rx buffer is guaranteed to be cache aligned, and will
* be invalidated by the SPI layer after the transfer is complete.
*
* @param[in] obj The SPI object that holds the transfer information
* @param[in] tx The transmit buffer
@@ -404,8 +410,16 @@ const PinMap *spi_slave_cs_pinmap(void);
* @param[in] event The logical OR of events to be registered
* @param[in] handler SPI interrupt handler
* @param[in] hint A suggestion for how to use DMA with this transfer
*
* @return True if DMA was actually used for the transfer, false otherwise (if interrupts or another CPU-based
* method is used to do the transfer).
*
* @note On MCUs with a data cache, the return value is used to determine if a cache invalidation needs to be done
* after the transfer is complete. If this function returns true, the driver layer will cache invalidate the Rx buffer under
* the assumption that the data needs to be re-read from main memory. Be careful, because if the read was not actually
* done by DMA, and the rx data is in the CPU cache, this invalidation will corrupt it.
*/
void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint);
bool spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint);
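
For targets, the new contract might be satisfied roughly as sketched below. This is not code from any real target: can_use_dma(), start_dma_transfer(), and start_irq_transfer() are hypothetical stand-ins for whatever a vendor HAL actually does.

bool spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length,
                         uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint)
{
    /* Decide up front whether this transfer will actually run on DMA. */
    bool used_dma = (hint != DMA_USAGE_NEVER) && can_use_dma(obj, tx_length, rx_length);

    if (used_dma) {
        start_dma_transfer(obj, tx, tx_length, rx, rx_length, bit_width, handler, event);
    } else {
        start_irq_transfer(obj, tx, tx_length, rx, rx_length, bit_width, handler, event);
    }

    /* The driver layer invalidates the Rx buffer from the data cache on completion only when
     * this returns true, so the value must reflect what actually happened. */
    return used_dma;
}
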

/** The asynchronous IRQ handler
*
