From b5ae14ecc74c4c76b1860500df616c8b83d651da Mon Sep 17 00:00:00 2001 From: Jamie Smith Date: Sun, 3 Dec 2023 23:49:45 -0800 Subject: [PATCH 1/8] Handle cache alignment in DMA SPI driver --- drivers/include/drivers/SPI.h | 50 +++-- drivers/source/SPI.cpp | 57 +++++- hal/include/hal/spi_api.h | 18 +- .../include/platform/CacheAlignedBuffer.h | 183 ++++++++++++++++++ .../COMPONENT_SD/include/SD/SDBlockDevice.h | 6 +- .../COMPONENT_SD/source/SDBlockDevice.cpp | 40 ++-- .../TARGET_Cypress/TARGET_PSOC6/cy_spi_api.c | 4 +- .../TARGET_MCU_K64F/spi_api.c | 4 +- .../TARGET_NRF5x/TARGET_NRF52/spi_api.c | 10 +- targets/TARGET_NUVOTON/TARGET_M2354/spi_api.c | 4 +- targets/TARGET_NUVOTON/TARGET_M251/spi_api.c | 4 +- targets/TARGET_NUVOTON/TARGET_M261/spi_api.c | 4 +- targets/TARGET_NUVOTON/TARGET_M451/spi_api.c | 4 +- targets/TARGET_NUVOTON/TARGET_M460/spi_api.c | 4 +- targets/TARGET_NUVOTON/TARGET_M480/spi_api.c | 4 +- .../TARGET_NUVOTON/TARGET_NANO100/spi_api.c | 4 +- .../TARGET_NUVOTON/TARGET_NUC472/spi_api.c | 4 +- .../TARGET_RENESAS/TARGET_RZ_A1XX/spi_api.c | 4 +- .../TARGET_RENESAS/TARGET_RZ_A2XX/spi_api.c | 4 +- .../stm32h7xx_hal_dma_ex.c | 13 +- targets/TARGET_STM/stm_spi_api.c | 68 +++---- targets/TARGET_STM/stm_spi_api.h | 1 + .../TARGET_EFM32/spi_api.c | 2 +- .../TARGET_TOSHIBA/TARGET_TMPM46B/spi_api.c | 4 +- .../TARGET_TOSHIBA/TARGET_TMPM4G9/spi_api.c | 4 +- .../TARGET_TOSHIBA/TARGET_TMPM4GR/spi_api.c | 4 +- .../TARGET_TOSHIBA/TARGET_TMPM4NR/spi_api.c | 4 +- 27 files changed, 414 insertions(+), 98 deletions(-) create mode 100644 platform/include/platform/CacheAlignedBuffer.h diff --git a/drivers/include/drivers/SPI.h b/drivers/include/drivers/SPI.h index 69df632d333..2c330266883 100644 --- a/drivers/include/drivers/SPI.h +++ b/drivers/include/drivers/SPI.h @@ -26,6 +26,7 @@ #include "drivers/DigitalOut.h" #include "platform/SingletonPtr.h" #include "platform/NonCopyable.h" +#include "platform/CacheAlignedBuffer.h" #if defined MBED_CONF_DRIVERS_SPI_COUNT_MAX && DEVICE_SPI_COUNT > MBED_CONF_DRIVERS_SPI_COUNT_MAX #define SPI_PERIPHERALS_USED MBED_CONF_DRIVERS_SPI_COUNT_MAX @@ -458,8 +459,9 @@ class SPI : private NonCopyable { * @param tx_buffer The TX buffer with data to be transferred. If NULL is passed, * the default %SPI value is sent. * @param tx_length The length of TX buffer in bytes. - * @param rx_buffer The RX buffer which is used for received data. If NULL is passed, - * received data are ignored. + * @param rx_buffer The RX buffer which is used for received data. Rather than a C array, a CacheAlignedBuffer + * structure must be passed so that cache alignment can be handled for data received from DMA. + * May be nullptr if rx_length is 0. * @param rx_length The length of RX buffer in bytes. * @param callback The event callback function. * @param event The logical OR of events to subscribe to. May be #SPI_EVENT_ALL, or some combination @@ -469,17 +471,17 @@ class SPI : private NonCopyable { * @retval 0 If the transfer has started. 
* @retval -1 if the transfer could not be enqueued (increase drivers.spi_transaction_queue_len option) */ - template<typename WordT> + template<typename WordT, size_t BufferSize> typename std::enable_if<std::is_integral<WordT>::value, int>::type - transfer(const WordT *tx_buffer, int tx_length, WordT *rx_buffer, int rx_length, const event_callback_t &callback, int event = SPI_EVENT_COMPLETE) + transfer(const WordT *tx_buffer, int tx_length, CacheAlignedBuffer<WordT, BufferSize> & rx_buffer, int rx_length, const event_callback_t &callback, int event = SPI_EVENT_COMPLETE) { - return transfer_internal(tx_buffer, tx_length, rx_buffer, rx_length, callback, event); + return transfer_internal(tx_buffer, tx_length, rx_buffer.data(), rx_length, callback, event); } // Overloads of the above to support passing nullptr - template<typename WordT> + template<typename WordT, size_t BufferSize> typename std::enable_if<std::is_integral<WordT>::value, int>::type - transfer(const std::nullptr_t *tx_buffer, int tx_length, WordT *rx_buffer, int rx_length, const event_callback_t &callback, int event = SPI_EVENT_COMPLETE) + transfer(const std::nullptr_t *tx_buffer, int tx_length, CacheAlignedBuffer<WordT, BufferSize> & rx_buffer, int rx_length, const event_callback_t &callback, int event = SPI_EVENT_COMPLETE) { return transfer_internal(tx_buffer, tx_length, rx_buffer, rx_length, callback, event); } @@ -502,8 +504,10 @@ class SPI : private NonCopyable { * Internally, the chip vendor may implement this function using either DMA or interrupts. * * @param tx_buffer The TX buffer with data to be transferred. May be nullptr if tx_length is 0. - * @param tx_length The length of TX buffer in bytes. If 0, no transmission is done. - * @param rx_buffer The RX buffer, which is used for received data. May be nullptr if tx_length is 0. + * @param tx_length The length of TX buffer in bytes. If 0, the default %SPI data value is sent when receiving data. + * @param rx_buffer The RX buffer which is used for received data. Rather than a C array, a CacheAlignedBuffer + * structure must be passed so that cache alignment can be handled for data received from DMA. + * May be nullptr if rx_length is 0. * @param rx_length The length of RX buffer in bytes If 0, no reception is done. * @param timeout timeout value. Use #rtos::Kernel::wait_for_u32_forever to wait forever (the default).
* @@ -513,19 +517,19 @@ class SPI : private NonCopyable { * @retval 2 on other error * @retval 0 on success */ - template + template typename std::enable_if::value, int>::type - transfer_and_wait(const WordT *tx_buffer, int tx_length, WordT *rx_buffer, int rx_length, rtos::Kernel::Clock::duration_u32 timeout = rtos::Kernel::wait_for_u32_forever) + transfer_and_wait(const WordT *tx_buffer, int tx_length, CacheAlignedBuffer & rx_buffer, int rx_length, rtos::Kernel::Clock::duration_u32 timeout = rtos::Kernel::wait_for_u32_forever) { - return transfer_and_wait_internal(tx_buffer, tx_length, rx_buffer, rx_length, timeout); + return transfer_and_wait_internal(tx_buffer, tx_length, rx_buffer.data(), rx_length, timeout); } // Overloads of the above to support passing nullptr - template + template typename std::enable_if::value, int>::type - transfer_and_wait(const std::nullptr_t *tx_buffer, int tx_length, WordT *rx_buffer, int rx_length, rtos::Kernel::Clock::duration_u32 timeout = rtos::Kernel::wait_for_u32_forever) + transfer_and_wait(const std::nullptr_t *tx_buffer, int tx_length, CacheAlignedBuffer & rx_buffer, int rx_length, rtos::Kernel::Clock::duration_u32 timeout = rtos::Kernel::wait_for_u32_forever) { - return transfer_and_wait_internal(tx_buffer, tx_length, rx_buffer, rx_length, timeout); + return transfer_and_wait_internal(tx_buffer, tx_length, rx_buffer.data(), rx_length, timeout); } template typename std::enable_if::value, int>::type @@ -574,7 +578,8 @@ class SPI : private NonCopyable { * the default SPI value is sent * @param tx_length The length of TX buffer in bytes. * @param rx_buffer The RX buffer which is used for received data. If NULL is passed, - * received data are ignored. + * received data are ignored. This buffer is guaranteed to be cache aligned + * if the MCU has a cache. * @param rx_length The length of RX buffer in bytes. * @param callback The event callback function. * @param event The event mask of events to modify. @@ -599,6 +604,7 @@ class SPI : private NonCopyable { * @param tx_buffer The TX buffer with data to be transferred. May be nullptr if tx_length is 0. * @param tx_length The length of TX buffer in bytes. If 0, no transmission is done. * @param rx_buffer The RX buffer, which is used for received data. May be nullptr if tx_length is 0. + * This buffer is guaranteed to be cache aligned if the MCU has a cache. * @param rx_length The length of RX buffer in bytes If 0, no reception is done. * @param timeout timeout value. Use #rtos::Kernel::wait_for_u32_forever to wait forever (the default). * @@ -722,6 +728,18 @@ class SPI : private NonCopyable { * iff start_transfer() has been called and the chip has been selected but irq_handler_asynch() * has NOT been called yet. */ volatile bool _transfer_in_progress = false; + + // If there is a transfer in progress, this indicates whether it used DMA and therefore requires a cache + // flush at the end + bool _transfer_in_progress_uses_dma; + +#if __DCACHE_PRESENT + // These variables store the location and length in bytes of the Rx buffer if an async transfer + // is in progress. They are used for invalidating the cache after the transfer completes. 
+ void * _transfer_in_progress_rx_buffer; + size_t _transfer_in_progress_rx_len; +#endif + /* Event flags used for transfer_and_wait() */ rtos::EventFlags _transfer_and_wait_flags; #endif // DEVICE_SPI_ASYNCH diff --git a/drivers/source/SPI.cpp b/drivers/source/SPI.cpp index 376ec7c708b..af7368e7b93 100644 --- a/drivers/source/SPI.cpp +++ b/drivers/source/SPI.cpp @@ -396,6 +396,17 @@ void SPI::abort_transfer() _set_ssel(1); } +#if __DCACHE_PRESENT + if(_transfer_in_progress_uses_dma) + { + // If the cache is present, invalidate the Rx data so it's loaded from main RAM. + // We only want to do this if DMA actually got used for the transfer because, if interrupts + // were used instead, the cache might have the correct data and NOT the main memory. + SCB_InvalidateDCache_by_Addr(_transfer_in_progress_rx_buffer, _transfer_in_progress_rx_len); + } +#endif + _transfer_in_progress = false; + #if MBED_CONF_DRIVERS_SPI_TRANSACTION_QUEUE_LEN dequeue_transaction(); #endif @@ -466,8 +477,33 @@ void SPI::start_transfer(const void *tx_buffer, int tx_length, void *rx_buffer, _callback = callback; _irq.callback(&SPI::irq_handler_asynch); + +#if __DCACHE_PRESENT + // On devices with a cache, we need to carefully manage the Tx and Rx buffer cache invalidation. + // We can assume that asynchronous SPI implementations might rely on DMA, and that DMA will + // not interoperate with the CPU cache. So, manual flushing/invalidation will be required. + // This page is very useful for how to do this correctly: + // https://community.st.com/t5/stm32-mcus-products/maintaining-cpu-data-cache-coherence-for-dma-buffers/td-p/95746 + if(tx_length > 0) + { + // For chips with a cache, we need to evict the Tx data from cache to main memory. + // This ensures that the DMA controller can see the most up-to-date copy of the data. + SCB_CleanDCache_by_Addr(const_cast(tx_buffer), tx_length); + } + + // Additionally, we have to make sure that there aren't any pending changes which could be written back + // to the Rx buffer memory by the cache at a later date, corrupting the DMA results + if(rx_length > 0) + { + SCB_CleanInvalidateDCache_by_Addr(rx_buffer, rx_length); + } + _transfer_in_progress_rx_buffer = rx_buffer; + _transfer_in_progress_rx_len = rx_length; +#endif + _transfer_in_progress = true; - spi_master_transfer(&_peripheral->spi, tx_buffer, tx_length, rx_buffer, rx_length, bit_width, _irq.entry(), event, _usage); + + _transfer_in_progress_uses_dma = spi_master_transfer(&_peripheral->spi, tx_buffer, tx_length, rx_buffer, rx_length, bit_width, _irq.entry(), event, _usage); } void SPI::lock_deep_sleep() @@ -508,7 +544,18 @@ void SPI::dequeue_transaction() void SPI::irq_handler_asynch(void) { int event = spi_irq_handler_asynch(&_peripheral->spi); - if (_callback && (event & SPI_EVENT_ALL)) { + if ((event & SPI_EVENT_ALL)) { + +#if __DCACHE_PRESENT + if(_transfer_in_progress_uses_dma) + { + // If the cache is present, invalidate the Rx data so it's loaded from main RAM. + // We only want to do this if DMA actually got used for the transfer because, if interrupts + // were used instead, the cache might have the correct data and NOT the main memory. + SCB_InvalidateDCache_by_Addr(_transfer_in_progress_rx_buffer, _transfer_in_progress_rx_len); + } +#endif + // If using GPIO CS mode, unless we were asked to keep the peripheral selected, deselect it. // If there's another transfer queued, we *do* want to deselect the peripheral now. // It will be reselected in start_transfer() which is called by dequeue_transaction() below. 
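For reference, the cache-coherence pattern these SPI.cpp changes implement is sketched below. The sketch is illustrative only: the SCB_* functions are the CMSIS-Core cache helpers used in the hunks, while start_dma_transfer() is a hypothetical stand-in for whatever vendor HAL call actually starts the transfer.

    // Before starting DMA: push any cached Tx data out to RAM so the DMA engine reads current bytes.
    SCB_CleanDCache_by_Addr(const_cast<void *>(tx_buffer), tx_length);
    // Clean+invalidate the Rx region so no dirty cache line can later be evicted on top of the DMA results.
    SCB_CleanInvalidateDCache_by_Addr(rx_buffer, rx_length);
    start_dma_transfer(tx_buffer, tx_length, rx_buffer, rx_length); // hypothetical HAL call
    // ...then, in the completion interrupt (only if DMA was really used):
    SCB_InvalidateDCache_by_Addr(rx_buffer, rx_length); // force the CPU to re-read the results from RAM
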
@@ -518,7 +565,11 @@ void SPI::irq_handler_asynch(void) _transfer_in_progress = false; unlock_deep_sleep(); - _callback.call(event & SPI_EVENT_ALL); + + if(_callback) + { + _callback.call(event & SPI_EVENT_ALL); + } } #if MBED_CONF_DRIVERS_SPI_TRANSACTION_QUEUE_LEN if (event & (SPI_EVENT_ALL | SPI_EVENT_INTERNAL_TRANSFER_COMPLETE)) { diff --git a/hal/include/hal/spi_api.h b/hal/include/hal/spi_api.h index 65853c69b3b..011073a02a7 100644 --- a/hal/include/hal/spi_api.h +++ b/hal/include/hal/spi_api.h @@ -25,6 +25,8 @@ #include "hal/dma_api.h" #include "hal/buffer.h" +#include + #if DEVICE_SPI /** @@ -393,7 +395,11 @@ const PinMap *spi_slave_cs_pinmap(void); * @{ */ -/** Begin the SPI transfer. Buffer pointers and lengths are specified in tx_buff and rx_buff +/** Begin the asynchronous SPI transfer. Buffer pointers and lengths are specified in tx_buff and rx_buff. + * + * If the device has a data cache, the Tx data is guaranteed to have been flushed from the cache + * to main memory already. Additionally, the Rx buffer is guaranteed to be cache aligned, and will + * be invalidated by the SPI layer after the transfer is complete. * * @param[in] obj The SPI object that holds the transfer information * @param[in] tx The transmit buffer @@ -404,8 +410,16 @@ const PinMap *spi_slave_cs_pinmap(void); * @param[in] event The logical OR of events to be registered * @param[in] handler SPI interrupt handler * @param[in] hint A suggestion for how to use DMA with this transfer + * + * @return True if DMA was actually used for the transfer, false otherwise (if interrupts or another CPU-based + * method is used to do the transfer). + * + * @note On MCUs with a data cache, the return value is used to determine if a cache invalidation needs to be done + * after the transfer is complete. If this function returns true, the driver layer will cache invalidate the Rx buffer under + * the assumption that the data needs to be re-read from main memory. Be careful, because if the read was not actually + * done by DMA, and the rx data is in the CPU cache, this invalidation will corrupt it. */ -void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint); +bool spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint); /** The asynchronous IRQ handler * diff --git a/platform/include/platform/CacheAlignedBuffer.h b/platform/include/platform/CacheAlignedBuffer.h new file mode 100644 index 00000000000..61b6f09f6a0 --- /dev/null +++ b/platform/include/platform/CacheAlignedBuffer.h @@ -0,0 +1,183 @@ +/* mbed Microcontroller Library + * Copyright (c) 2006-2023 ARM Limited + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MBED_CACHEALIGNEDBUFFER_H +#define MBED_CACHEALIGNEDBUFFER_H + +#include + +#include "device.h" + +namespace mbed { + + +/** + * @brief CacheAlignedBuffer is used by Mbed in locations where we need a cache-aligned buffer. + * + * Cache alignment is desirable in several different situations in embedded programming; often + * when working with DMA or other peripherals which write their results back to main memory. + * In order to read those results from memory without risk of getting old data from the + * CPU cache, one needs to align the buffer so it takes up an integer number of cache lines, + * then invalidate the cache lines so that the data gets reread from RAM. + * + * CacheAlignedBuffer provides an easy way to allocate the correct amount of space so that + * a buffer of any size can be made cache-aligned. + * + * @tparam DataT Type of the data to store in the buffer. Note: %CacheAlignedBuffer is not designed for + * using class types as DataT, and will not call constructors. + * @tparam BufferSize Buffer size (number of elements) needed by the application for the buffer. + */ +template +struct +CacheAlignedBuffer { +private: +#if __DCACHE_PRESENT + // Allocate enough extra space that we can shift the start of the buffer forward to be on a cache line. + // The worst case for this is when the first byte is allocated 1 byte past the start of a cache line, so we + // will need an additional (cache line size - 1) bytes. + // Additionally, since we are going to be invalidating this buffer, we can't allow any other variables to be + // in the same cache lines, or they might get corrupted. + // So, we need to round up the backing buffer size to the nearest multiple of the cache line size. + // The math for rounding up can be found here: + // https://community.st.com/t5/stm32-mcus-products/maintaining-cpu-data-cache-coherence-for-dma-buffers/td-p/95746 + constexpr static size_t requiredSizeRoundedUp = (BufferSize * sizeof(DataT) + __SCB_DCACHE_LINE_SIZE - 1) & ~((__SCB_DCACHE_LINE_SIZE) - 1); + constexpr static size_t backingBufferSizeBytes = requiredSizeRoundedUp + __SCB_DCACHE_LINE_SIZE - 1; +#else + constexpr static size_t backingBufferSize = BufferSize * sizeof(DataT); +#endif + + uint8_t _backingBuffer[backingBufferSizeBytes]; + DataT * _alignedArrayPtr; + + /** + * Find and return the first location in the given buffer that starts on a cache line. + * If this MCU does not use a cache, this function is a no-op. + * + * @param buffer Pointer to buffer + * + * @return Pointer to first data item, aligned at the start of a cache line. + */ + inline DataT * findCacheLineStart(uint8_t * buffer) + { +#if __DCACHE_PRESENT + // Use integer division to divide the address down to the cache line size, which + // rounds to the cache line before the given address. + // So that we always go one cache line back even if the given address is on a cache line, + // subtract 1. + ptrdiff_t prevCacheLine = ((ptrdiff_t)(buffer - 1)) / __SCB_DCACHE_LINE_SIZE; + + // Now we just have to multiply up again to get an address (adding 1 to go forward by 1 cache line) + return reinterpret_cast((prevCacheLine + 1) * __SCB_DCACHE_LINE_SIZE); +#else + return reinterpret_cast(buffer); +#endif + } + +public: + + // Iterator types + typedef DataT * iterator; + typedef DataT const * const_iterator; + + /** + * @brief Construct new cache-aligned buffer. Buffer will be zero-initialized. 
+ */ + CacheAlignedBuffer(): + _backingBuffer{}, + _alignedArrayPtr(findCacheLineStart(_backingBuffer)) + {} + + /** + * @brief Copy from other cache-aligned buffer. Buffer memory will be copied. + */ + CacheAlignedBuffer(CacheAlignedBuffer const & other): + _alignedArrayPtr(findCacheLineStart(_backingBuffer)) + { + memcpy(this->_alignedArrayPtr, other._alignedArrayPtr, BufferSize * sizeof(DataT)); + } + + /** + * @brief Assign from other cache-aligned buffer. Buffer memory will be assigned. + */ + CacheAlignedBuffer & operator=(CacheAlignedBuffer const & other) + { + memcpy(this->_alignedArrayPtr, other._alignedArrayPtr, BufferSize * sizeof(DataT)); + } + + /** + * @brief Get a pointer to the aligned data array inside the buffer + */ + DataT * data() { return _alignedArrayPtr; } + + /** + * @brief Get a pointer to the aligned data array inside the buffer (const version) + */ + DataT const * data() const { return _alignedArrayPtr; } + + /** + * @brief Element access + */ + DataT & operator[](size_t index) { return _alignedArrayPtr[index]; } + + /** + * @brief Element access (const) + */ + DataT operator[](size_t index) const { return _alignedArrayPtr[index]; } + + /** + * @brief Get iterator for start of buffer + */ + iterator begin() { return _alignedArrayPtr; } + + /** + * @brief Get iterator for start of buffer + */ + const_iterator begin() const { return _alignedArrayPtr; } + + /** + * @brief Get iterator for end of buffer + */ + iterator end() { return _alignedArrayPtr + BufferSize; } + + /** + * @brief Get iterator for end of buffer + */ + const_iterator end() const { return _alignedArrayPtr + BufferSize; } + + /** + * @return The maximum amount of DataT elements that this buffer can hold + */ + constexpr size_t capacity() { return BufferSize; } + + /** + * @brief If this MCU has a data cache, this function _invalidates_ the buffer in the data cache. + * + * Invalidation means that the next time code access the buffer data, it is guaranteed to be fetched from + * main memory rather than from the CPU cache. If there are changes made to the data which have not been + * flushed back to main memory, those changes will be lost! + */ + void invalidate() + { +#if __DCACHE_PRESENT + SCB_InvalidateDCache_by_Addr(_alignedArrayPtr, BufferSize * sizeof(DataT)); +#endif + } +}; + +} + +#endif //MBED_CACHEALIGNEDBUFFER_H diff --git a/storage/blockdevice/COMPONENT_SD/include/SD/SDBlockDevice.h b/storage/blockdevice/COMPONENT_SD/include/SD/SDBlockDevice.h index b1c7562596f..246fed59538 100644 --- a/storage/blockdevice/COMPONENT_SD/include/SD/SDBlockDevice.h +++ b/storage/blockdevice/COMPONENT_SD/include/SD/SDBlockDevice.h @@ -57,6 +57,10 @@ * Access an SD Card using SPI bus */ class SDBlockDevice : public mbed::BlockDevice { + + // Only HC block size is supported. Making this a static constant reduces code size. 
+ static constexpr uint32_t _block_size = 512; /*!< Block size supported for SDHC card is 512 bytes */ + public: /** Creates an SDBlockDevice on a SPI bus specified by pins (using dynamic pin-map) * @@ -302,7 +306,6 @@ class SDBlockDevice : public mbed::BlockDevice { } rtos::Mutex _mutex; - static const uint32_t _block_size; uint32_t _erase_size; bool _is_initialized; bool _dbg; @@ -314,6 +317,7 @@ class SDBlockDevice : public mbed::BlockDevice { #if DEVICE_SPI_ASYNCH bool _async_spi_enabled = false; + mbed::CacheAlignedBuffer _async_data_buffer; #endif }; diff --git a/storage/blockdevice/COMPONENT_SD/source/SDBlockDevice.cpp b/storage/blockdevice/COMPONENT_SD/source/SDBlockDevice.cpp index 381071b6803..8e9ac5fb902 100644 --- a/storage/blockdevice/COMPONENT_SD/source/SDBlockDevice.cpp +++ b/storage/blockdevice/COMPONENT_SD/source/SDBlockDevice.cpp @@ -181,7 +181,6 @@ using namespace std::chrono; #define SD_BLOCK_DEVICE_ERROR_ERASE -5010 /*!< Erase error: reset/sequence */ #define SD_BLOCK_DEVICE_ERROR_WRITE -5011 /*!< SPI Write error: !SPI_DATA_ACCEPTED */ -#define BLOCK_SIZE_HC 512 /*!< Block size supported for SD card is 512 bytes */ #define WRITE_BL_PARTIAL 0 /*!< Partial block write - Not supported */ #define SPI_CMD(x) (0x40 | (x & 0x3f)) @@ -248,9 +247,6 @@ using namespace std::chrono; #define SPI_READ_ERROR_ECC_C (0x1 << 2) /*!< Card ECC failed */ #define SPI_READ_ERROR_OFR (0x1 << 3) /*!< Out of Range */ -// Only HC block size is supported. Making this a static constant reduces code size. -const uint32_t SDBlockDevice::_block_size = BLOCK_SIZE_HC; - #if MBED_CONF_SD_CRC_ENABLED SDBlockDevice::SDBlockDevice(PinName mosi, PinName miso, PinName sclk, PinName cs, uint64_t hz, bool crc_on) : _sectors(0), _spi(mosi, miso, sclk, cs, use_gpio_ssel), _is_initialized(0), @@ -269,7 +265,7 @@ SDBlockDevice::SDBlockDevice(PinName mosi, PinName miso, PinName sclk, PinName c _init_sck = MBED_CONF_SD_INIT_FREQUENCY; _transfer_sck = hz; - _erase_size = BLOCK_SIZE_HC; + _erase_size = _block_size; } #if MBED_CONF_SD_CRC_ENABLED @@ -290,7 +286,7 @@ SDBlockDevice::SDBlockDevice(const spi_pinmap_t &spi_pinmap, PinName cs, uint64_ _init_sck = MBED_CONF_SD_INIT_FREQUENCY; _transfer_sck = hz; - _erase_size = BLOCK_SIZE_HC; + _erase_size = _block_size; } SDBlockDevice::~SDBlockDevice() @@ -925,7 +921,16 @@ int SDBlockDevice::_read_bytes(uint8_t *buffer, uint32_t length) // read data #if DEVICE_SPI_ASYNCH if (_async_spi_enabled) { - _spi.transfer_and_wait(nullptr, 0, buffer, length); + if(length > _async_data_buffer.capacity()) + { + return SD_BLOCK_DEVICE_ERROR_PARAMETER; + } + + // Do read into cache aligned buffer, then copy data into application buffer + if (_spi.transfer_and_wait(nullptr, 0, _async_data_buffer, length) != 0) { + return SD_BLOCK_DEVICE_ERROR_WRITE; + } + memcpy(buffer, _async_data_buffer.data(), length); } else #endif { @@ -968,9 +973,16 @@ int SDBlockDevice::_read(uint8_t *buffer, uint32_t length) // read data #if DEVICE_SPI_ASYNCH if (_async_spi_enabled) { - if (_spi.transfer_and_wait(nullptr, 0, buffer, length) != 0) { + if(length > _async_data_buffer.capacity()) + { + return SD_BLOCK_DEVICE_ERROR_PARAMETER; + } + + // Do read into cache aligned buffer, then copy data into application buffer + if (_spi.transfer_and_wait(nullptr, 0, _async_data_buffer, length) != 0) { return SD_BLOCK_DEVICE_ERROR_WRITE; } + memcpy(buffer, _async_data_buffer.data(), length); } else #endif { @@ -1067,7 +1079,13 @@ bd_size_t SDBlockDevice::_sd_sectors() debug_if(SD_DBG, "Didn't get a response from the 
disk\n"); return 0; } +#ifdef DEVICE_SPI_ASYNCH + CacheAlignedBuffer csd_buffer; + uint8_t * csd = csd_buffer.data(); +#else uint8_t csd[16]; +#endif + if (_read_bytes(csd, 16) != 0) { debug_if(SD_DBG, "Couldn't read csd response from disk\n"); return 0; @@ -1091,10 +1109,10 @@ bd_size_t SDBlockDevice::_sd_sectors() // ERASE_BLK_EN = 1: Erase in multiple of 512 bytes supported if (ext_bits(csd, 46, 46)) { - _erase_size = BLOCK_SIZE_HC; + _erase_size = _block_size; } else { // ERASE_BLK_EN = 1: Erase in multiple of SECTOR_SIZE supported - _erase_size = BLOCK_SIZE_HC * (ext_bits(csd, 45, 39) + 1); + _erase_size = _block_size * (ext_bits(csd, 45, 39) + 1); } break; @@ -1105,7 +1123,7 @@ bd_size_t SDBlockDevice::_sd_sectors() debug_if(SD_DBG, "Sectors: 0x%" PRIx64 "x : %" PRIu64 "\n", blocks, blocks); debug_if(SD_DBG, "Capacity: %" PRIu64 " MB\n", (blocks / (2048U))); // ERASE_BLK_EN is fixed to 1, which means host can erase one or multiple of 512 bytes. - _erase_size = BLOCK_SIZE_HC; + _erase_size = _block_size; break; default: diff --git a/targets/TARGET_Cypress/TARGET_PSOC6/cy_spi_api.c b/targets/TARGET_Cypress/TARGET_PSOC6/cy_spi_api.c index 6780ecff399..de40eed7301 100644 --- a/targets/TARGET_Cypress/TARGET_PSOC6/cy_spi_api.c +++ b/targets/TARGET_Cypress/TARGET_PSOC6/cy_spi_api.c @@ -234,7 +234,7 @@ const PinMap *spi_slave_cs_pinmap(void) #if DEVICE_SPI_ASYNCH -void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, MBED_UNUSED uint8_t bit_width, uint32_t handler, uint32_t event, MBED_UNUSED DMAUsage hint) +bool spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, MBED_UNUSED uint8_t bit_width, uint32_t handler, uint32_t event, MBED_UNUSED DMAUsage hint) { struct spi_s *spi = cy_get_spi(obj); spi->async_handler = (void (*)(void))handler; @@ -248,6 +248,8 @@ void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, MBED_ERROR(MBED_MAKE_ERROR(MBED_MODULE_DRIVER_SPI, MBED_ERROR_CODE_FAILED_OPERATION), "cyhal_spi_transfer_async"); } spi->async_in_progress = true; + + return false; // Currently we always use interrupts, not DMA } uint32_t spi_irq_handler_asynch(spi_t *obj) diff --git a/targets/TARGET_Freescale/TARGET_MCUXpresso_MCUS/TARGET_MCU_K64F/spi_api.c b/targets/TARGET_Freescale/TARGET_MCUXpresso_MCUS/TARGET_MCU_K64F/spi_api.c index a35f0a9cdc0..1ac6a1856bb 100644 --- a/targets/TARGET_Freescale/TARGET_MCUXpresso_MCUS/TARGET_MCU_K64F/spi_api.c +++ b/targets/TARGET_Freescale/TARGET_MCUXpresso_MCUS/TARGET_MCU_K64F/spi_api.c @@ -358,7 +358,7 @@ static void spi_buffer_set(spi_t *obj, const void *tx, uint32_t tx_length, void obj->rx_buff.width = bit_width; } -void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) +bool spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) { if (spi_active(obj)) { return; @@ -400,6 +400,8 @@ void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, // Can't enter deep sleep as long as SPI transfer is active sleep_manager_lock_deep_sleep(); } + + return hint != DMA_USAGE_NEVER; // DMA will be used as long as the hint is not NEVER } uint32_t spi_irq_handler_asynch(spi_t *obj) diff --git a/targets/TARGET_NORDIC/TARGET_NRF5x/TARGET_NRF52/spi_api.c b/targets/TARGET_NORDIC/TARGET_NRF5x/TARGET_NRF52/spi_api.c index 
3cd900edca1..77b0e0a70f9 100644 --- a/targets/TARGET_NORDIC/TARGET_NRF5x/TARGET_NRF52/spi_api.c +++ b/targets/TARGET_NORDIC/TARGET_NRF5x/TARGET_NRF52/spi_api.c @@ -882,8 +882,10 @@ static void nordic_nrf5_spi_event_handler(nrfx_spi_evt_t const *p_event, void *p * Parameter event The logical OR of events to be registered * Parameter handler SPI interrupt handler * Parameter hint A suggestion for how to use DMA with this transfer + * + * Returns bool True if DMA was used for the transfer, false otherwise */ -void spi_master_transfer(spi_t *obj, +bool spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, @@ -949,6 +951,12 @@ void spi_master_transfer(spi_t *obj, /* Signal callback handler. */ callback(); } + +#if NRFX_CHECK(NRFX_SPIM_ENABLED) + return true; // We always use DMA when the SPIM peripheral is available +#else + return false; // We always use interrupts when the SPI peripheral is available +#endif } /** The asynchronous IRQ handler diff --git a/targets/TARGET_NUVOTON/TARGET_M2354/spi_api.c b/targets/TARGET_NUVOTON/TARGET_M2354/spi_api.c index 3e6342935aa..3a8cd3f1276 100644 --- a/targets/TARGET_NUVOTON/TARGET_M2354/spi_api.c +++ b/targets/TARGET_NUVOTON/TARGET_M2354/spi_api.c @@ -445,7 +445,7 @@ void spi_slave_write(spi_t *obj, int value) #endif #if DEVICE_SPI_ASYNCH -void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) +bool spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) { SPI_T *spi_base = (SPI_T *) NU_MODBASE(obj->spi.spi); SPI_SET_DATA_WIDTH(spi_base, bit_width); @@ -573,6 +573,8 @@ void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, /* Don't enable SPI TX/RX threshold interrupts as commented above */ } + + return obj->spi.dma_usage != DMA_USAGE_NEVER; } /** diff --git a/targets/TARGET_NUVOTON/TARGET_M251/spi_api.c b/targets/TARGET_NUVOTON/TARGET_M251/spi_api.c index d8e9d378aca..25eeb2d1c32 100644 --- a/targets/TARGET_NUVOTON/TARGET_M251/spi_api.c +++ b/targets/TARGET_NUVOTON/TARGET_M251/spi_api.c @@ -369,7 +369,7 @@ void spi_slave_write(spi_t *obj, int value) #endif #if DEVICE_SPI_ASYNCH -void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) +bool spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) { SPI_T *spi_base = (SPI_T *) NU_MODBASE(obj->spi.spi); SPI_SET_DATA_WIDTH(spi_base, bit_width); @@ -497,6 +497,8 @@ void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, /* Don't enable SPI TX/RX threshold interrupts as commented above */ } + + return obj->spi.dma_usage != DMA_USAGE_NEVER; } /** diff --git a/targets/TARGET_NUVOTON/TARGET_M261/spi_api.c b/targets/TARGET_NUVOTON/TARGET_M261/spi_api.c index ebf7da50f25..313781728b7 100644 --- a/targets/TARGET_NUVOTON/TARGET_M261/spi_api.c +++ b/targets/TARGET_NUVOTON/TARGET_M261/spi_api.c @@ -390,7 +390,7 @@ void spi_slave_write(spi_t *obj, int value) #endif #if DEVICE_SPI_ASYNCH -void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) +bool spi_master_transfer(spi_t *obj, const void *tx, size_t
tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) { SPI_T *spi_base = (SPI_T *) NU_MODBASE(obj->spi.spi); SPI_SET_DATA_WIDTH(spi_base, bit_width); @@ -518,6 +518,8 @@ void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, /* Don't enable SPI TX/RX threshold interrupts as commented above */ } + + return obj->spi.dma_usage != DMA_USAGE_NEVER; } /** diff --git a/targets/TARGET_NUVOTON/TARGET_M451/spi_api.c b/targets/TARGET_NUVOTON/TARGET_M451/spi_api.c index 8b2de0ed2e4..2e80b1fb691 100644 --- a/targets/TARGET_NUVOTON/TARGET_M451/spi_api.c +++ b/targets/TARGET_NUVOTON/TARGET_M451/spi_api.c @@ -383,7 +383,7 @@ void spi_slave_write(spi_t *obj, int value) #endif #if DEVICE_SPI_ASYNCH -void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) +bool spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) { SPI_T *spi_base = (SPI_T *) NU_MODBASE(obj->spi.spi); SPI_SET_DATA_WIDTH(spi_base, bit_width); @@ -501,6 +501,8 @@ void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, /* Don't enable SPI TX/RX threshold interrupts as commented above */ } + + return obj->spi.dma_usage != DMA_USAGE_NEVER; } /** diff --git a/targets/TARGET_NUVOTON/TARGET_M460/spi_api.c b/targets/TARGET_NUVOTON/TARGET_M460/spi_api.c index c9f7d4802e8..126bde64428 100644 --- a/targets/TARGET_NUVOTON/TARGET_M460/spi_api.c +++ b/targets/TARGET_NUVOTON/TARGET_M460/spi_api.c @@ -488,7 +488,7 @@ void spi_slave_write(spi_t *obj, int value) #endif #if DEVICE_SPI_ASYNCH -void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) +bool spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) { SPI_T *spi_base = (SPI_T *) NU_MODBASE(obj->spi.spi); SPI_SET_DATA_WIDTH(spi_base, bit_width); @@ -616,6 +616,8 @@ void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, /* Don't enable SPI TX/RX threshold interrupts as commented above */ } + + return obj->spi.dma_usage != DMA_USAGE_NEVER; } /** diff --git a/targets/TARGET_NUVOTON/TARGET_M480/spi_api.c b/targets/TARGET_NUVOTON/TARGET_M480/spi_api.c index ca5cf79e8b4..e3d2b552e11 100644 --- a/targets/TARGET_NUVOTON/TARGET_M480/spi_api.c +++ b/targets/TARGET_NUVOTON/TARGET_M480/spi_api.c @@ -439,7 +439,7 @@ void spi_slave_write(spi_t *obj, int value) #endif #if DEVICE_SPI_ASYNCH -void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) +bool spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) { SPI_T *spi_base = (SPI_T *) NU_MODBASE(obj->spi.spi); SPI_SET_DATA_WIDTH(spi_base, bit_width); @@ -567,6 +567,8 @@ void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, /* Don't enable SPI TX/RX threshold interrupts as commented above */ } + + return obj->spi.dma_usage != DMA_USAGE_NEVER; } /** diff --git a/targets/TARGET_NUVOTON/TARGET_NANO100/spi_api.c b/targets/TARGET_NUVOTON/TARGET_NANO100/spi_api.c index 
3862edf5162..c546862fc93 100644 --- a/targets/TARGET_NUVOTON/TARGET_NANO100/spi_api.c +++ b/targets/TARGET_NUVOTON/TARGET_NANO100/spi_api.c @@ -434,7 +434,7 @@ void spi_slave_write(spi_t *obj, int value) #endif #if DEVICE_SPI_ASYNCH -void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) +bool spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) { SPI_T *spi_base = (SPI_T *) NU_MODBASE(obj->spi.spi); SPI_SET_DATA_WIDTH(spi_base, bit_width); @@ -549,6 +549,8 @@ void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, PDMA_Trigger(obj->spi.dma_chn_id_rx); PDMA_Trigger(obj->spi.dma_chn_id_tx); } + + return obj->spi.dma_usage != DMA_USAGE_NEVER; } /** diff --git a/targets/TARGET_NUVOTON/TARGET_NUC472/spi_api.c b/targets/TARGET_NUVOTON/TARGET_NUC472/spi_api.c index ddcf800de57..2c154dff563 100644 --- a/targets/TARGET_NUVOTON/TARGET_NUC472/spi_api.c +++ b/targets/TARGET_NUVOTON/TARGET_NUC472/spi_api.c @@ -389,7 +389,7 @@ void spi_slave_write(spi_t *obj, int value) #endif #if DEVICE_SPI_ASYNCH -void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) +bool spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) { SPI_T *spi_base = (SPI_T *) NU_MODBASE(obj->spi.spi); SPI_SET_DATA_WIDTH(spi_base, bit_width); @@ -503,6 +503,8 @@ void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, /* Don't enable SPI TX/RX threshold interrupts as commented above */ } + + return obj->spi.dma_usage != DMA_USAGE_NEVER; } /** diff --git a/targets/TARGET_RENESAS/TARGET_RZ_A1XX/spi_api.c b/targets/TARGET_RENESAS/TARGET_RZ_A1XX/spi_api.c index 294de6efba6..0b6a09c17b4 100644 --- a/targets/TARGET_RENESAS/TARGET_RZ_A1XX/spi_api.c +++ b/targets/TARGET_RENESAS/TARGET_RZ_A1XX/spi_api.c @@ -518,7 +518,7 @@ static void spi_async_read(spi_t *obj) * ASYNCHRONOUS HAL ******************************************************************************/ -void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) +bool spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) { int i; MBED_ASSERT(obj); @@ -556,6 +556,8 @@ void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, spi_irqs_set(obj, 1); spi_async_write(obj); + + return false; // Currently we always use interrupts, not DMA } uint32_t spi_irq_handler_asynch(spi_t *obj) diff --git a/targets/TARGET_RENESAS/TARGET_RZ_A2XX/spi_api.c b/targets/TARGET_RENESAS/TARGET_RZ_A2XX/spi_api.c index 295ae9c4876..4e402514e45 100644 --- a/targets/TARGET_RENESAS/TARGET_RZ_A2XX/spi_api.c +++ b/targets/TARGET_RENESAS/TARGET_RZ_A2XX/spi_api.c @@ -526,7 +526,7 @@ static void spi_async_read(spi_t *obj) * ASYNCHRONOUS HAL ******************************************************************************/ -void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) +bool 
spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) { int i; MBED_ASSERT(obj); @@ -564,6 +564,8 @@ void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, spi_irqs_set(obj, 1); spi_async_write(obj); + + return false; // Currently we always use interrupts, not DMA } uint32_t spi_irq_handler_asynch(spi_t *obj) diff --git a/targets/TARGET_STM/TARGET_STM32H7/STM32Cube_FW/STM32H7xx_HAL_Driver/stm32h7xx_hal_dma_ex.c b/targets/TARGET_STM/TARGET_STM32H7/STM32Cube_FW/STM32H7xx_HAL_Driver/stm32h7xx_hal_dma_ex.c index e55c3d54ceb..c5f2ef7d9d2 100644 --- a/targets/TARGET_STM/TARGET_STM32H7/STM32Cube_FW/STM32H7xx_HAL_Driver/stm32h7xx_hal_dma_ex.c +++ b/targets/TARGET_STM/TARGET_STM32H7/STM32Cube_FW/STM32H7xx_HAL_Driver/stm32h7xx_hal_dma_ex.c @@ -290,7 +290,18 @@ HAL_StatusTypeDef HAL_DMAEx_MultiBufferStart_IT(DMA_HandleTypeDef *hdma, uint32_ { /* Enable Common interrupts*/ MODIFY_REG(((DMA_Stream_TypeDef *)hdma->Instance)->CR, (DMA_IT_TC | DMA_IT_TE | DMA_IT_DME | DMA_IT_HT), (DMA_IT_TC | DMA_IT_TE | DMA_IT_DME)); - ((DMA_Stream_TypeDef *)hdma->Instance)->FCR |= DMA_IT_FE; + + /* Mbed CE mod: Only enable the FIFO Error interrupt if the FIFO is actually enabled. + * If it's not enabled, then this interrupt can trigger spuriously from memory bus + * stalls that the DMA engine encounters, and this creates random DMA failures. + * Reference forum thread here: + * https://community.st.com/t5/stm32-mcus-products/spi-dma-fifo-error-issue-feifx/td-p/537074 + * also: https://community.st.com/t5/stm32-mcus-touch-gfx-and-gui/spi-dma-error-is-occurred-when-the-other-dma-memory-to-memory-is/td-p/191590 + */ + if(((DMA_Stream_TypeDef *)hdma->Instance)->FCR & DMA_SxFCR_DMDIS) + { + ((DMA_Stream_TypeDef *)hdma->Instance)->FCR |= DMA_IT_FE; + } if((hdma->XferHalfCpltCallback != NULL) || (hdma->XferM1HalfCpltCallback != NULL)) { diff --git a/targets/TARGET_STM/stm_spi_api.c b/targets/TARGET_STM/stm_spi_api.c index f444afeb1e6..65d65e059cf 100644 --- a/targets/TARGET_STM/stm_spi_api.c +++ b/targets/TARGET_STM/stm_spi_api.c @@ -1410,13 +1410,15 @@ int spi_master_block_write(spi_t *obj, const char *tx_buffer, int tx_length, int total = (tx_length > rx_length) ? tx_length : rx_length; if (handle->Init.Direction == SPI_DIRECTION_2LINES) { + const int word_size = 0x01 << bitshift; + int write_fill_frame = write_fill; /* extend fill symbols for 16/32 bit modes */ - for (int i = 0; i < bitshift; i++) { + for (int i = 0; i < word_size; i++) { write_fill_frame = (write_fill_frame << 8) | write_fill; } - const int word_size = 0x01 << bitshift; + for (int i = 0; i < total; i += word_size) { int out = (i < tx_length) ? 
spi_get_word_from_buffer(tx_buffer + i, bitshift) : write_fill_frame; int in = spi_master_write(obj, out); @@ -1534,8 +1536,8 @@ typedef enum { } transfer_type_t; -/// @returns the number of bytes transferred, or `0` if nothing transferred -static int spi_master_start_asynch_transfer(spi_t *obj, transfer_type_t transfer_type, const void *tx, void *rx, size_t length, DMAUsage hint) +/// @returns True if DMA was used, false otherwise +static bool spi_master_start_asynch_transfer(spi_t *obj, transfer_type_t transfer_type, const void *tx, void *rx, size_t length, DMAUsage hint) { struct spi_s *spiobj = SPI_S(obj); SPI_HandleTypeDef *handle = &(spiobj->handle); @@ -1547,7 +1549,6 @@ static int spi_master_start_asynch_transfer(spi_t *obj, transfer_type_t transfer size_t words; obj->spi.transfer_type = transfer_type; - words = length >> bitshift; bool useDMA = false; @@ -1579,6 +1580,8 @@ static int spi_master_start_asynch_transfer(spi_t *obj, transfer_type_t transfer useDMA = true; } + + obj->spi.curr_transfer_uses_dma = useDMA; #endif DEBUG_PRINTF("SPI inst=0x%8X Start: type=%u, length=%u, DMA=%d\r\n", (int) handle->Instance, transfer_type, length, !!useDMA); @@ -1590,15 +1593,6 @@ static int spi_master_start_asynch_transfer(spi_t *obj, transfer_type_t transfer NVIC_SetPriority(irq_n, 1); NVIC_EnableIRQ(irq_n); -#if defined(STM32_SPI_CAPABILITY_DMA) && defined(__DCACHE_PRESENT) - if (useDMA && transfer_type != SPI_TRANSFER_TYPE_RX) - { - // For chips with a cache (e.g. Cortex-M7), we need to evict the Tx data from cache to main memory. - // This ensures that the DMA controller can see the most up-to-date copy of the data. - SCB_CleanDCache_by_Addr((volatile void*)tx, length); - } -#endif - // flush FIFO #if defined(SPI_FLAG_FRLVL) HAL_SPIEx_FlushRxFifo(handle); @@ -1635,7 +1629,7 @@ static int spi_master_start_asynch_transfer(spi_t *obj, transfer_type_t transfer if (useDMA) { #if defined(STM32_SPI_CAPABILITY_DMA) && defined(__DCACHE_PRESENT) - // For chips with a cache (e.g. Cortex-M7), we need to evict the Tx data from cache to main memory. + // For chips with a cache (e.g. Cortex-M7), we need to evict the Tx fill data from cache to main memory. // This ensures that the DMA controller can see the most up-to-date copy of the data. SCB_CleanDCache_by_Addr(rx, length); #endif @@ -1650,15 +1644,6 @@ static int spi_master_start_asynch_transfer(spi_t *obj, transfer_type_t transfer length = 0; } -#if defined(STM32_SPI_CAPABILITY_DMA) && defined(__DCACHE_PRESENT) - if (useDMA && transfer_type != SPI_TRANSFER_TYPE_TX) - { - // For chips with a cache (e.g. Cortex-M7), we need to invalidate the Rx data in cache. - // This ensures that the CPU will fetch the data from SRAM instead of using its cache. 
- SCB_InvalidateDCache_by_Addr(rx, length); - } -#endif - if (rc) { #if defined(SPI_IP_VERSION_V2) // enable SPI back in case of error @@ -1667,14 +1652,13 @@ static int spi_master_start_asynch_transfer(spi_t *obj, transfer_type_t transfer } #endif DEBUG_PRINTF("SPI: RC=%u\n", rc); - length = 0; } - return length; + return useDMA; } // asynchronous API -void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) +bool spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) { struct spi_s *spiobj = SPI_S(obj); SPI_HandleTypeDef *handle = &(spiobj->handle); @@ -1687,7 +1671,7 @@ void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, // don't do anything, if the buffers aren't valid if (!use_tx && !use_rx) { - return; + return false; } // copy the buffers to the SPI object @@ -1717,11 +1701,14 @@ void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, obj->tx_buff.length = size; obj->rx_buff.length = size; } - spi_master_start_asynch_transfer(obj, SPI_TRANSFER_TYPE_TXRX, tx, rx, size, hint); + return spi_master_start_asynch_transfer(obj, SPI_TRANSFER_TYPE_TXRX, tx, rx, size, hint); } else if (use_tx) { - spi_master_start_asynch_transfer(obj, SPI_TRANSFER_TYPE_TX, tx, NULL, tx_length, hint); + return spi_master_start_asynch_transfer(obj, SPI_TRANSFER_TYPE_TX, tx, NULL, tx_length, hint); } else if (use_rx) { - spi_master_start_asynch_transfer(obj, SPI_TRANSFER_TYPE_RX, NULL, rx, rx_length, hint); + return spi_master_start_asynch_transfer(obj, SPI_TRANSFER_TYPE_RX, NULL, rx, rx_length, hint); + } + else { + return false; } } @@ -1830,21 +1817,10 @@ void spi_abort_asynch(spi_t *obj) NVIC_ClearPendingIRQ(irq_n); NVIC_DisableIRQ(irq_n); -#ifdef STM32_SPI_CAPABILITY_DMA - // Abort DMA transfers if DMA channels have been allocated - if(spiobj->txDMAInitialized) { - HAL_DMA_Abort_IT(spiobj->handle.hdmatx); - } - if(spiobj->rxDMAInitialized) { - HAL_DMA_Abort_IT(spiobj->handle.hdmarx); - } -#endif - - // clean-up - LL_SPI_Disable(SPI_INST(obj)); - HAL_SPI_DeInit(handle); - - init_spi(obj); + // Use HAL abort function. + // Conveniently, this is smart enough to automatically abort the DMA transfer + // if DMA was used. + HAL_SPI_Abort_IT(handle); } #endif //DEVICE_SPI_ASYNCH diff --git a/targets/TARGET_STM/stm_spi_api.h b/targets/TARGET_STM/stm_spi_api.h index 6fb22c93c4e..f34f7d17ef7 100644 --- a/targets/TARGET_STM/stm_spi_api.h +++ b/targets/TARGET_STM/stm_spi_api.h @@ -36,6 +36,7 @@ struct spi_s { #if DEVICE_SPI_ASYNCH uint32_t event; uint8_t transfer_type; + bool curr_transfer_uses_dma; // Callback function for when we get an interrupt on an async transfer. 
// This will point, through a bit of indirection, to SPI::irq_handler_asynch() diff --git a/targets/TARGET_Silicon_Labs/TARGET_EFM32/spi_api.c b/targets/TARGET_Silicon_Labs/TARGET_EFM32/spi_api.c index c7a2b76d09d..02d44822d2d 100644 --- a/targets/TARGET_Silicon_Labs/TARGET_EFM32/spi_api.c +++ b/targets/TARGET_Silicon_Labs/TARGET_EFM32/spi_api.c @@ -1168,7 +1168,7 @@ void spi_master_transfer_dma(spi_t *obj, const void *txdata, void *rxdata, int t * @param[in] handler SPI interrupt handler * @param[in] hint A suggestion for how to use DMA with this transfer */ -void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) +bool spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) { if( spi_active(obj) ) return; diff --git a/targets/TARGET_TOSHIBA/TARGET_TMPM46B/spi_api.c b/targets/TARGET_TOSHIBA/TARGET_TMPM46B/spi_api.c index bd98132b0b1..ddce123061a 100644 --- a/targets/TARGET_TOSHIBA/TARGET_TMPM46B/spi_api.c +++ b/targets/TARGET_TOSHIBA/TARGET_TMPM46B/spi_api.c @@ -409,7 +409,7 @@ const PinMap *spi_slave_cs_pinmap() #ifdef DEVICE_SPI_ASYNCH -void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, +bool spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) { struct spi_s *obj_s = SPI_S(obj); @@ -455,6 +455,8 @@ void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, SSP_Enable(spi); NVIC_EnableIRQ(obj_s->irqn); + + return false; // Currently we always use interrupts, not DMA } uint32_t spi_irq_handler_asynch(spi_t *obj) diff --git a/targets/TARGET_TOSHIBA/TARGET_TMPM4G9/spi_api.c b/targets/TARGET_TOSHIBA/TARGET_TMPM4G9/spi_api.c index f8ebe48fa29..3e3cfe722e8 100644 --- a/targets/TARGET_TOSHIBA/TARGET_TMPM4G9/spi_api.c +++ b/targets/TARGET_TOSHIBA/TARGET_TMPM4G9/spi_api.c @@ -484,7 +484,7 @@ const PinMap *spi_slave_cs_pinmap() #ifdef DEVICE_SPI_ASYNCH -void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) +bool spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) { struct spi_s *obj_s = SPI_S(obj); tspi_t *p_obj = &(obj_s->p_obj); @@ -535,6 +535,8 @@ void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, NVIC_EnableIRQ(obj_s->rxirqn); } NVIC_EnableIRQ(obj_s->errirqn); + + return false; // Currently we always use interrupts, not DMA } uint32_t spi_irq_handler_asynch(spi_t *obj) diff --git a/targets/TARGET_TOSHIBA/TARGET_TMPM4GR/spi_api.c b/targets/TARGET_TOSHIBA/TARGET_TMPM4GR/spi_api.c index 7ebaef77823..1a997914003 100644 --- a/targets/TARGET_TOSHIBA/TARGET_TMPM4GR/spi_api.c +++ b/targets/TARGET_TOSHIBA/TARGET_TMPM4GR/spi_api.c @@ -517,7 +517,7 @@ const PinMap *spi_slave_cs_pinmap() #if DEVICE_SPI_ASYNCH -void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, +bool spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) { struct spi_s *spiobj = SPI_S(obj); @@ -574,6 +574,8 @@ 
void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, NVIC_EnableIRQ(p_irqn->Error); NVIC_EnableIRQ(p_irqn->Tx); NVIC_EnableIRQ(p_irqn->Rx); + + return false; // Currently we always use interrupts, not DMA } uint32_t spi_irq_handler_asynch(spi_t *obj) { diff --git a/targets/TARGET_TOSHIBA/TARGET_TMPM4NR/spi_api.c b/targets/TARGET_TOSHIBA/TARGET_TMPM4NR/spi_api.c index dc958ba927e..f288f5b3987 100644 --- a/targets/TARGET_TOSHIBA/TARGET_TMPM4NR/spi_api.c +++ b/targets/TARGET_TOSHIBA/TARGET_TMPM4NR/spi_api.c @@ -398,7 +398,7 @@ const PinMap *spi_slave_cs_pinmap() #if DEVICE_SPI_ASYNCH -void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, +bool spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) { struct spi_s *spiobj = SPI_S(obj); @@ -455,6 +455,8 @@ void spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, NVIC_EnableIRQ(p_irqn->Error); NVIC_EnableIRQ(p_irqn->Tx); NVIC_EnableIRQ(p_irqn->Rx); + + return false; // Currently we always use interrupts, not DMA } uint32_t spi_irq_handler_asynch(spi_t *obj) { From cfbb8104e2fe4d75ca94a9eb7bd4bb8d3c22be64 Mon Sep 17 00:00:00 2001 From: Jamie Smith Date: Mon, 4 Dec 2023 00:39:55 -0800 Subject: [PATCH 2/8] Fix build on cache-less devices --- platform/include/platform/CacheAlignedBuffer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/platform/include/platform/CacheAlignedBuffer.h b/platform/include/platform/CacheAlignedBuffer.h index 61b6f09f6a0..97b5339a6a9 100644 --- a/platform/include/platform/CacheAlignedBuffer.h +++ b/platform/include/platform/CacheAlignedBuffer.h @@ -57,7 +57,7 @@ CacheAlignedBuffer { constexpr static size_t requiredSizeRoundedUp = (BufferSize * sizeof(DataT) + __SCB_DCACHE_LINE_SIZE - 1) & ~((__SCB_DCACHE_LINE_SIZE) - 1); constexpr static size_t backingBufferSizeBytes = requiredSizeRoundedUp + __SCB_DCACHE_LINE_SIZE - 1; #else - constexpr static size_t backingBufferSize = BufferSize * sizeof(DataT); + constexpr static size_t backingBufferSizeBytes = BufferSize * sizeof(DataT); #endif uint8_t _backingBuffer[backingBufferSizeBytes]; From b45357e3204880d4ee787ac30bd4a7c6207139de Mon Sep 17 00:00:00 2001 From: Jamie Smith Date: Mon, 4 Dec 2023 01:39:19 -0800 Subject: [PATCH 3/8] Fix a couple things I missed --- drivers/include/drivers/SPI.h | 9 +++++++-- drivers/source/SPI.cpp | 8 ++++---- targets/TARGET_Silicon_Labs/TARGET_EFM32/spi_api.c | 13 +++++++++++-- 3 files changed, 22 insertions(+), 8 deletions(-) diff --git a/drivers/include/drivers/SPI.h b/drivers/include/drivers/SPI.h index 2c330266883..c12cd381cd9 100644 --- a/drivers/include/drivers/SPI.h +++ b/drivers/include/drivers/SPI.h @@ -195,6 +195,11 @@ const use_gpio_ssel_t use_gpio_ssel; * the transfer but others can execute). Here's a sample of how to send the same data as above * using the blocking async API:

* + *

Note that when using the asynchronous API, you must use the CacheAlignedBuffer class when declaring the + * receive buffer. This is because some processors' async SPI implementations require the received buffer to + * be at an address which is aligned to the processor cache line size. CacheAlignedBuffer takes care of this + * for you and provides functions (data(), begin(), end()) to access the underlying data in the buffer.

+ * * @code * #include "mbed.h" * @@ -204,8 +209,8 @@ const use_gpio_ssel_t use_gpio_ssel; * device.format(8, 0); * * uint8_t command[2] = {0x0A, 0x0B}; - * uint8_t response[2]; - * int result = device.transfer_and_wait(command, sizeof(command), response, sizeof(response)); + * CacheAlignedBuffer response; + * int result = device.transfer_and_wait(command, sizeof(command), response, sizeof(command)); * } * @endcode * diff --git a/drivers/source/SPI.cpp b/drivers/source/SPI.cpp index af7368e7b93..9c6a56dcbb0 100644 --- a/drivers/source/SPI.cpp +++ b/drivers/source/SPI.cpp @@ -397,7 +397,7 @@ void SPI::abort_transfer() } #if __DCACHE_PRESENT - if(_transfer_in_progress_uses_dma) + if(_transfer_in_progress_uses_dma && _transfer_in_progress_rx_len > 0) { // If the cache is present, invalidate the Rx data so it's loaded from main RAM. // We only want to do this if DMA actually got used for the transfer because, if interrupts @@ -492,10 +492,10 @@ void SPI::start_transfer(const void *tx_buffer, int tx_length, void *rx_buffer, } // Additionally, we have to make sure that there aren't any pending changes which could be written back - // to the Rx buffer memory by the cache at a later date, corrupting the DMA results + // to the Rx buffer memory by the cache at a later date, corrupting the DMA results. if(rx_length > 0) { - SCB_CleanInvalidateDCache_by_Addr(rx_buffer, rx_length); + SCB_InvalidateDCache_by_Addr(rx_buffer, rx_length); } _transfer_in_progress_rx_buffer = rx_buffer; _transfer_in_progress_rx_len = rx_length; @@ -547,7 +547,7 @@ void SPI::irq_handler_asynch(void) if ((event & SPI_EVENT_ALL)) { #if __DCACHE_PRESENT - if(_transfer_in_progress_uses_dma) + if(_transfer_in_progress_uses_dma && _transfer_in_progress_rx_len > 0) { // If the cache is present, invalidate the Rx data so it's loaded from main RAM. // We only want to do this if DMA actually got used for the transfer because, if interrupts diff --git a/targets/TARGET_Silicon_Labs/TARGET_EFM32/spi_api.c b/targets/TARGET_Silicon_Labs/TARGET_EFM32/spi_api.c index 02d44822d2d..0a852b5f9cb 100644 --- a/targets/TARGET_Silicon_Labs/TARGET_EFM32/spi_api.c +++ b/targets/TARGET_Silicon_Labs/TARGET_EFM32/spi_api.c @@ -1116,8 +1116,10 @@ static void spi_activate_dma(spi_t *obj, void* rxdata, const void* txdata, int t * * ALWAYS: use DMA if channels are available, and hold on to the channels after the transfer. * If the previous transfer has kept the channel, that channel will continue to get used. * +* Returns true if DMA was used, false otherwise. 
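+* +* As a quick sketch of the decision below (summarizing, not changing, the behavior): a hint of +* DMA_USAGE_NEVER always takes the IRQ path and returns false; any other hint returns true when +* DMA channels are (or were previously) acquired, and falls back to the IRQ path, returning +* false, when no channels are available.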
+* ********************************************************************/ -void spi_master_transfer_dma(spi_t *obj, const void *txdata, void *rxdata, int tx_length, int rx_length, void* cb, DMAUsage hint) +bool spi_master_transfer_dma(spi_t *obj, const void *txdata, void *rxdata, int tx_length, int rx_length, void* cb, DMAUsage hint) { /* Init DMA here to include it in the power figure */ dma_init(); @@ -1128,11 +1130,13 @@ void spi_master_transfer_dma(spi_t *obj, const void *txdata, void *rxdata, int t if (hint != DMA_USAGE_NEVER && obj->spi.dmaOptionsTX.dmaUsageState == DMA_USAGE_ALLOCATED) { /* setup has already been done, so just activate the transfer */ spi_activate_dma(obj, rxdata, txdata, tx_length, rx_length); + return true; } else if (hint == DMA_USAGE_NEVER) { /* use IRQ */ obj->spi.spi->IFC = 0xFFFFFFFF; spi_master_write_asynch(obj); spi_enable_interrupt(obj, (uint32_t)cb, true); + return false; } else { /* try to acquire channels */ dma_init(); @@ -1147,11 +1151,13 @@ void spi_master_transfer_dma(spi_t *obj, const void *txdata, void *rxdata, int t spi_master_dma_channel_setup(obj, cb); /* and activate the transfer */ spi_activate_dma(obj, rxdata, txdata, tx_length, rx_length); + return true; } else { /* DMA is unavailable, so fall back to IRQ */ obj->spi.spi->IFC = 0xFFFFFFFF; spi_master_write_asynch(obj); spi_enable_interrupt(obj, (uint32_t)cb, true); + return false; } } } @@ -1167,6 +1173,9 @@ void spi_master_transfer_dma(spi_t *obj, const void *txdata, void *rxdata, int t * @param[in] event The logical OR of events to be registered * @param[in] handler SPI interrupt handler * @param[in] hint A suggestion for how to use DMA with this transfer + * + * @return True if DMA was actually used for the transfer, false otherwise (if interrupts or another CPU-based + * method is used to do the transfer). 
*/ bool spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, size_t rx_length, uint8_t bit_width, uint32_t handler, uint32_t event, DMAUsage hint) { @@ -1190,7 +1199,7 @@ bool spi_master_transfer(spi_t *obj, const void *tx, size_t tx_length, void *rx, spi_enable_event(obj, event, true); /* And kick off the transfer */ - spi_master_transfer_dma(obj, tx, rx, tx_length, rx_length, (void*)handler, hint); + return spi_master_transfer_dma(obj, tx, rx, tx_length, rx_length, (void*)handler, hint); } From 86dbd364d35996ee97a45d0b84019456f38e10ad Mon Sep 17 00:00:00 2001 From: Jamie Smith Date: Mon, 4 Dec 2023 01:56:33 -0800 Subject: [PATCH 4/8] Run formatter, improve CacheAlignedBuffer docs --- drivers/include/drivers/SPI.h | 10 +-- drivers/source/SPI.cpp | 17 ++--- .../include/platform/CacheAlignedBuffer.h | 74 +++++++++++++------ .../COMPONENT_SD/source/SDBlockDevice.cpp | 8 +- 4 files changed, 66 insertions(+), 43 deletions(-) diff --git a/drivers/include/drivers/SPI.h b/drivers/include/drivers/SPI.h index c12cd381cd9..59f5f8befb7 100644 --- a/drivers/include/drivers/SPI.h +++ b/drivers/include/drivers/SPI.h @@ -478,7 +478,7 @@ class SPI : private NonCopyable { */ template typename std::enable_if::value, int>::type - transfer(const WordT *tx_buffer, int tx_length, CacheAlignedBuffer & rx_buffer, int rx_length, const event_callback_t &callback, int event = SPI_EVENT_COMPLETE) + transfer(const WordT *tx_buffer, int tx_length, CacheAlignedBuffer &rx_buffer, int rx_length, const event_callback_t &callback, int event = SPI_EVENT_COMPLETE) { return transfer_internal(tx_buffer, tx_length, rx_buffer.data(), rx_length, callback, event); } @@ -486,7 +486,7 @@ class SPI : private NonCopyable { // Overloads of the above to support passing nullptr template typename std::enable_if::value, int>::type - transfer(const std::nullptr_t *tx_buffer, int tx_length, CacheAlignedBuffer & rx_buffer, int rx_length, const event_callback_t &callback, int event = SPI_EVENT_COMPLETE) + transfer(const std::nullptr_t *tx_buffer, int tx_length, CacheAlignedBuffer &rx_buffer, int rx_length, const event_callback_t &callback, int event = SPI_EVENT_COMPLETE) { return transfer_internal(tx_buffer, tx_length, rx_buffer, rx_length, callback, event); } @@ -524,7 +524,7 @@ class SPI : private NonCopyable { */ template typename std::enable_if::value, int>::type - transfer_and_wait(const WordT *tx_buffer, int tx_length, CacheAlignedBuffer & rx_buffer, int rx_length, rtos::Kernel::Clock::duration_u32 timeout = rtos::Kernel::wait_for_u32_forever) + transfer_and_wait(const WordT *tx_buffer, int tx_length, CacheAlignedBuffer &rx_buffer, int rx_length, rtos::Kernel::Clock::duration_u32 timeout = rtos::Kernel::wait_for_u32_forever) { return transfer_and_wait_internal(tx_buffer, tx_length, rx_buffer.data(), rx_length, timeout); } @@ -532,7 +532,7 @@ class SPI : private NonCopyable { // Overloads of the above to support passing nullptr template typename std::enable_if::value, int>::type - transfer_and_wait(const std::nullptr_t *tx_buffer, int tx_length, CacheAlignedBuffer & rx_buffer, int rx_length, rtos::Kernel::Clock::duration_u32 timeout = rtos::Kernel::wait_for_u32_forever) + transfer_and_wait(const std::nullptr_t *tx_buffer, int tx_length, CacheAlignedBuffer &rx_buffer, int rx_length, rtos::Kernel::Clock::duration_u32 timeout = rtos::Kernel::wait_for_u32_forever) { return transfer_and_wait_internal(tx_buffer, tx_length, rx_buffer.data(), rx_length, timeout); } @@ -741,7 +741,7 @@ class SPI : private NonCopyable { #if 
__DCACHE_PRESENT // These variables store the location and length in bytes of the Rx buffer if an async transfer // is in progress. They are used for invalidating the cache after the transfer completes. - void * _transfer_in_progress_rx_buffer; + void *_transfer_in_progress_rx_buffer; size_t _transfer_in_progress_rx_len; #endif diff --git a/drivers/source/SPI.cpp b/drivers/source/SPI.cpp index 9c6a56dcbb0..a0339487215 100644 --- a/drivers/source/SPI.cpp +++ b/drivers/source/SPI.cpp @@ -397,8 +397,7 @@ void SPI::abort_transfer() } #if __DCACHE_PRESENT - if(_transfer_in_progress_uses_dma && _transfer_in_progress_rx_len > 0) - { + if (_transfer_in_progress_uses_dma && _transfer_in_progress_rx_len > 0) { // If the cache is present, invalidate the Rx data so it's loaded from main RAM. // We only want to do this if DMA actually got used for the transfer because, if interrupts // were used instead, the cache might have the correct data and NOT the main memory. @@ -484,17 +483,15 @@ void SPI::start_transfer(const void *tx_buffer, int tx_length, void *rx_buffer, // not interoperate with the CPU cache. So, manual flushing/invalidation will be required. // This page is very useful for how to do this correctly: // https://community.st.com/t5/stm32-mcus-products/maintaining-cpu-data-cache-coherence-for-dma-buffers/td-p/95746 - if(tx_length > 0) - { + if (tx_length > 0) { // For chips with a cache, we need to evict the Tx data from cache to main memory. // This ensures that the DMA controller can see the most up-to-date copy of the data. - SCB_CleanDCache_by_Addr(const_cast(tx_buffer), tx_length); + SCB_CleanDCache_by_Addr(const_cast(tx_buffer), tx_length); } // Additionally, we have to make sure that there aren't any pending changes which could be written back // to the Rx buffer memory by the cache at a later date, corrupting the DMA results. - if(rx_length > 0) - { + if (rx_length > 0) { SCB_InvalidateDCache_by_Addr(rx_buffer, rx_length); } _transfer_in_progress_rx_buffer = rx_buffer; @@ -547,8 +544,7 @@ void SPI::irq_handler_asynch(void) if ((event & SPI_EVENT_ALL)) { #if __DCACHE_PRESENT - if(_transfer_in_progress_uses_dma && _transfer_in_progress_rx_len > 0) - { + if (_transfer_in_progress_uses_dma && _transfer_in_progress_rx_len > 0) { // If the cache is present, invalidate the Rx data so it's loaded from main RAM. // We only want to do this if DMA actually got used for the transfer because, if interrupts // were used instead, the cache might have the correct data and NOT the main memory. @@ -566,8 +562,7 @@ void SPI::irq_handler_asynch(void) unlock_deep_sleep(); - if(_callback) - { + if (_callback) { _callback.call(event & SPI_EVENT_ALL); } } diff --git a/platform/include/platform/CacheAlignedBuffer.h b/platform/include/platform/CacheAlignedBuffer.h index 97b5339a6a9..6e8e56a66f3 100644 --- a/platform/include/platform/CacheAlignedBuffer.h +++ b/platform/include/platform/CacheAlignedBuffer.h @@ -28,8 +28,11 @@ namespace mbed { /** * @brief CacheAlignedBuffer is used by Mbed in locations where we need a cache-aligned buffer. * - * Cache alignment is desirable in several different situations in embedded programming; often - * when working with DMA or other peripherals which write their results back to main memory. + * Cache alignment is desirable in several different situations in embedded programming -- one common + * use is when working with DMA or other peripherals which write their results back to main memory. 
+ * After these peripherals do their work, the data will be correct in main memory, but the CPU cache + * might also contain a value cached from that memory which is now incorrect. + * * In order to read those results from memory without risk of getting old data from the * CPU cache, one needs to align the buffer so it takes up an integer number of cache lines, * then invalidate the cache lines so that the data gets reread from RAM. @@ -43,7 +46,7 @@ namespace mbed { */ template struct -CacheAlignedBuffer { + CacheAlignedBuffer { private: #if __DCACHE_PRESENT // Allocate enough extra space that we can shift the start of the buffer forward to be on a cache line. @@ -54,14 +57,14 @@ CacheAlignedBuffer { // So, we need to round up the backing buffer size to the nearest multiple of the cache line size. // The math for rounding up can be found here: // https://community.st.com/t5/stm32-mcus-products/maintaining-cpu-data-cache-coherence-for-dma-buffers/td-p/95746 - constexpr static size_t requiredSizeRoundedUp = (BufferSize * sizeof(DataT) + __SCB_DCACHE_LINE_SIZE - 1) & ~((__SCB_DCACHE_LINE_SIZE) - 1); + constexpr static size_t requiredSizeRoundedUp = (BufferSize *sizeof(DataT) + __SCB_DCACHE_LINE_SIZE - 1) & ~((__SCB_DCACHE_LINE_SIZE) - 1); constexpr static size_t backingBufferSizeBytes = requiredSizeRoundedUp + __SCB_DCACHE_LINE_SIZE - 1; #else constexpr static size_t backingBufferSizeBytes = BufferSize * sizeof(DataT); #endif uint8_t _backingBuffer[backingBufferSizeBytes]; - DataT * _alignedArrayPtr; + DataT *_alignedArrayPtr; /** * Find and return the first location in the given buffer that starts on a cache line. @@ -71,7 +74,7 @@ CacheAlignedBuffer { * * @return Pointer to first data item, aligned at the start of a cache line. */ - inline DataT * findCacheLineStart(uint8_t * buffer) + inline DataT *findCacheLineStart(uint8_t *buffer) { #if __DCACHE_PRESENT // Use integer division to divide the address down to the cache line size, which @@ -90,22 +93,22 @@ CacheAlignedBuffer { public: // Iterator types - typedef DataT * iterator; - typedef DataT const * const_iterator; + typedef DataT *iterator; + typedef DataT const *const_iterator; /** * @brief Construct new cache-aligned buffer. Buffer will be zero-initialized. */ CacheAlignedBuffer(): - _backingBuffer{}, - _alignedArrayPtr(findCacheLineStart(_backingBuffer)) + _backingBuffer{}, + _alignedArrayPtr(findCacheLineStart(_backingBuffer)) {} /** * @brief Copy from other cache-aligned buffer. Buffer memory will be copied. */ - CacheAlignedBuffer(CacheAlignedBuffer const & other): - _alignedArrayPtr(findCacheLineStart(_backingBuffer)) + CacheAlignedBuffer(CacheAlignedBuffer const &other): + _alignedArrayPtr(findCacheLineStart(_backingBuffer)) { memcpy(this->_alignedArrayPtr, other._alignedArrayPtr, BufferSize * sizeof(DataT)); } @@ -113,7 +116,7 @@ CacheAlignedBuffer { /** * @brief Assign from other cache-aligned buffer. Buffer memory will be assigned. 
*/ - CacheAlignedBuffer & operator=(CacheAlignedBuffer const & other) + CacheAlignedBuffer &operator=(CacheAlignedBuffer const &other) { memcpy(this->_alignedArrayPtr, other._alignedArrayPtr, BufferSize * sizeof(DataT)); } @@ -121,47 +124,74 @@ CacheAlignedBuffer { /** * @brief Get a pointer to the aligned data array inside the buffer */ - DataT * data() { return _alignedArrayPtr; } + DataT *data() + { + return _alignedArrayPtr; + } /** * @brief Get a pointer to the aligned data array inside the buffer (const version) */ - DataT const * data() const { return _alignedArrayPtr; } + DataT const *data() const + { + return _alignedArrayPtr; + } /** * @brief Element access */ - DataT & operator[](size_t index) { return _alignedArrayPtr[index]; } + DataT &operator[](size_t index) + { + return _alignedArrayPtr[index]; + } /** * @brief Element access (const) */ - DataT operator[](size_t index) const { return _alignedArrayPtr[index]; } + DataT operator[](size_t index) const + { + return _alignedArrayPtr[index]; + } /** * @brief Get iterator for start of buffer */ - iterator begin() { return _alignedArrayPtr; } + iterator begin() + { + return _alignedArrayPtr; + } /** * @brief Get iterator for start of buffer */ - const_iterator begin() const { return _alignedArrayPtr; } + const_iterator begin() const + { + return _alignedArrayPtr; + } /** * @brief Get iterator for end of buffer */ - iterator end() { return _alignedArrayPtr + BufferSize; } + iterator end() + { + return _alignedArrayPtr + BufferSize; + } /** * @brief Get iterator for end of buffer */ - const_iterator end() const { return _alignedArrayPtr + BufferSize; } + const_iterator end() const + { + return _alignedArrayPtr + BufferSize; + } /** * @return The maximum amount of DataT elements that this buffer can hold */ - constexpr size_t capacity() { return BufferSize; } + constexpr size_t capacity() + { + return BufferSize; + } /** * @brief If this MCU has a data cache, this function _invalidates_ the buffer in the data cache. 
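
The cache-maintenance choreography that the driver changes above implement is easier to see outside of
diff context. Below is a condensed sketch of the pattern followed around a DMA transfer (a sketch only,
assuming the CMSIS cache helpers and a core with a data cache such as a Cortex-M7; start_dma() and
wait_for_dma_complete() are hypothetical placeholders for the vendor HAL calls, not functions from this
patch set):

    #if __DCACHE_PRESENT
    // Hypothetical HAL hooks, declared only so the sketch is self-contained.
    void start_dma(const uint8_t *tx, uint8_t *rx, size_t len);
    void wait_for_dma_complete();

    void dma_transfer_with_cache_maintenance(const uint8_t *tx, uint8_t *rx, size_t len)
    {
        // Evict pending Tx writes from the cache so the DMA engine reads up-to-date data.
        SCB_CleanDCache_by_Addr(const_cast<uint8_t *>(tx), len);

        // Discard cached copies of the Rx region so no dirty line can be written back
        // over the DMA results while the transfer is running.
        SCB_InvalidateDCache_by_Addr(rx, len);

        start_dma(tx, rx, len);
        wait_for_dma_complete();

        // Invalidate again after completion: speculative reads may have refilled the Rx
        // lines mid-transfer, and the CPU must fetch the DMA-written data from main RAM.
        SCB_InvalidateDCache_by_Addr(rx, len);
    }
    #endif

Cleaning before the transfer and invalidating both before and after mirrors what SPI::start_transfer()
and SPI::irq_handler_asynch() do in the patches above.
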
diff --git a/storage/blockdevice/COMPONENT_SD/source/SDBlockDevice.cpp b/storage/blockdevice/COMPONENT_SD/source/SDBlockDevice.cpp index 8e9ac5fb902..21d5bb8657f 100644 --- a/storage/blockdevice/COMPONENT_SD/source/SDBlockDevice.cpp +++ b/storage/blockdevice/COMPONENT_SD/source/SDBlockDevice.cpp @@ -921,8 +921,7 @@ int SDBlockDevice::_read_bytes(uint8_t *buffer, uint32_t length) // read data #if DEVICE_SPI_ASYNCH if (_async_spi_enabled) { - if(length > _async_data_buffer.capacity()) - { + if (length > _async_data_buffer.capacity()) { return SD_BLOCK_DEVICE_ERROR_PARAMETER; } @@ -973,8 +972,7 @@ int SDBlockDevice::_read(uint8_t *buffer, uint32_t length) // read data #if DEVICE_SPI_ASYNCH if (_async_spi_enabled) { - if(length > _async_data_buffer.capacity()) - { + if (length > _async_data_buffer.capacity()) { return SD_BLOCK_DEVICE_ERROR_PARAMETER; } @@ -1081,7 +1079,7 @@ bd_size_t SDBlockDevice::_sd_sectors() } #ifdef DEVICE_SPI_ASYNCH CacheAlignedBuffer csd_buffer; - uint8_t * csd = csd_buffer.data(); + uint8_t *csd = csd_buffer.data(); #else uint8_t csd[16]; #endif From 1f2a9feb4e73059ce67bf731fadeef190b6385e2 Mon Sep 17 00:00:00 2001 From: Jamie Smith Date: Wed, 6 Dec 2023 19:01:52 -0800 Subject: [PATCH 5/8] Add missing license identifiers to source files --- targets/TARGET_NORDIC/TARGET_NRF5x/TARGET_NRF52/spi_api.c | 1 + targets/TARGET_NUVOTON/TARGET_M451/spi_api.c | 1 + targets/TARGET_NUVOTON/TARGET_NANO100/spi_api.c | 1 + targets/TARGET_NUVOTON/TARGET_NUC472/spi_api.c | 1 + targets/TARGET_STM/stm_spi_api.c | 1 + targets/TARGET_TOSHIBA/TARGET_TMPM46B/spi_api.c | 1 + 6 files changed, 6 insertions(+) diff --git a/targets/TARGET_NORDIC/TARGET_NRF5x/TARGET_NRF52/spi_api.c b/targets/TARGET_NORDIC/TARGET_NRF5x/TARGET_NRF52/spi_api.c index 77b0e0a70f9..56b2600ed9b 100644 --- a/targets/TARGET_NORDIC/TARGET_NRF5x/TARGET_NRF52/spi_api.c +++ b/targets/TARGET_NORDIC/TARGET_NRF5x/TARGET_NRF52/spi_api.c @@ -1,6 +1,7 @@ /* * Copyright (c) 2017 Nordic Semiconductor ASA * All rights reserved. + * SPDX-License-Identifier: LicenseRef-Nordic-5-Clause * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: diff --git a/targets/TARGET_NUVOTON/TARGET_M451/spi_api.c b/targets/TARGET_NUVOTON/TARGET_M451/spi_api.c index 2e80b1fb691..a43b8756960 100644 --- a/targets/TARGET_NUVOTON/TARGET_M451/spi_api.c +++ b/targets/TARGET_NUVOTON/TARGET_M451/spi_api.c @@ -1,5 +1,6 @@ /* mbed Microcontroller Library * Copyright (c) 2015-2016 Nuvoton + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/targets/TARGET_NUVOTON/TARGET_NANO100/spi_api.c b/targets/TARGET_NUVOTON/TARGET_NANO100/spi_api.c index c546862fc93..fe8c4e79e61 100644 --- a/targets/TARGET_NUVOTON/TARGET_NANO100/spi_api.c +++ b/targets/TARGET_NUVOTON/TARGET_NANO100/spi_api.c @@ -1,5 +1,6 @@ /* mbed Microcontroller Library * Copyright (c) 2015-2017 Nuvoton + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/targets/TARGET_NUVOTON/TARGET_NUC472/spi_api.c b/targets/TARGET_NUVOTON/TARGET_NUC472/spi_api.c index 2c154dff563..6ce546a7967 100644 --- a/targets/TARGET_NUVOTON/TARGET_NUC472/spi_api.c +++ b/targets/TARGET_NUVOTON/TARGET_NUC472/spi_api.c @@ -1,5 +1,6 @@ /* mbed Microcontroller Library * Copyright (c) 2015-2016 Nuvoton + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/targets/TARGET_STM/stm_spi_api.c b/targets/TARGET_STM/stm_spi_api.c index 65d65e059cf..d5c49fbcd08 100644 --- a/targets/TARGET_STM/stm_spi_api.c +++ b/targets/TARGET_STM/stm_spi_api.c @@ -2,6 +2,7 @@ ******************************************************************************* * Copyright (c) 2015, STMicroelectronics * All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/targets/TARGET_TOSHIBA/TARGET_TMPM46B/spi_api.c b/targets/TARGET_TOSHIBA/TARGET_TMPM46B/spi_api.c index ddce123061a..9d8b846e582 100644 --- a/targets/TARGET_TOSHIBA/TARGET_TMPM46B/spi_api.c +++ b/targets/TARGET_TOSHIBA/TARGET_TMPM46B/spi_api.c @@ -2,6 +2,7 @@ ******************************************************************************* * (C)Copyright TOSHIBA ELECTRONIC DEVICES & STORAGE CORPORATION 2017 All rights reserved * All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: From 29e76a7e71432a09ec53ae85dc9e3ff17537154a Mon Sep 17 00:00:00 2001 From: Jamie Smith Date: Tue, 12 Dec 2023 23:22:48 -0800 Subject: [PATCH 6/8] Make CacheAlignedBuffer heap-allocatable, try and add exclusion for Nordic license in scancode_evaluate.py --- drivers/include/drivers/SPI.h | 20 +- .../include/platform/CacheAlignedBuffer.h | 260 +++++++++++++----- .../COMPONENT_SD/include/SD/SDBlockDevice.h | 2 +- .../COMPONENT_SD/source/SDBlockDevice.cpp | 2 +- tools/python/run_python_tests.sh | 2 +- .../scancode_evaluate/scancode_evaluate.py | 32 ++- 6 files changed, 230 insertions(+), 88 deletions(-) diff --git a/drivers/include/drivers/SPI.h b/drivers/include/drivers/SPI.h index 59f5f8befb7..58e79f9f16d 100644 --- a/drivers/include/drivers/SPI.h +++ b/drivers/include/drivers/SPI.h @@ -476,18 +476,20 @@ class SPI : private NonCopyable { * @retval 0 If the transfer has started. 
* @retval -1 if the transfer could not be enqueued (increase drivers.spi_transaction_queue_len option) */ - template + template typename std::enable_if::value, int>::type - transfer(const WordT *tx_buffer, int tx_length, CacheAlignedBuffer &rx_buffer, int rx_length, const event_callback_t &callback, int event = SPI_EVENT_COMPLETE) + transfer(const WordT *tx_buffer, int tx_length, CacheAlignedBuffer &rx_buffer, int rx_length, const event_callback_t &callback, int event = SPI_EVENT_COMPLETE) { + MBED_ASSERT(rx_length <= static_cast(rx_buffer.capacity())); return transfer_internal(tx_buffer, tx_length, rx_buffer.data(), rx_length, callback, event); } // Overloads of the above to support passing nullptr - template + template typename std::enable_if::value, int>::type - transfer(const std::nullptr_t *tx_buffer, int tx_length, CacheAlignedBuffer &rx_buffer, int rx_length, const event_callback_t &callback, int event = SPI_EVENT_COMPLETE) + transfer(const std::nullptr_t *tx_buffer, int tx_length, CacheAlignedBuffer &rx_buffer, int rx_length, const event_callback_t &callback, int event = SPI_EVENT_COMPLETE) { + MBED_ASSERT(rx_length <= static_cast(rx_buffer.capacity())); return transfer_internal(tx_buffer, tx_length, rx_buffer, rx_length, callback, event); } template @@ -522,18 +524,20 @@ class SPI : private NonCopyable { * @retval 2 on other error * @retval 0 on success */ - template + template typename std::enable_if::value, int>::type - transfer_and_wait(const WordT *tx_buffer, int tx_length, CacheAlignedBuffer &rx_buffer, int rx_length, rtos::Kernel::Clock::duration_u32 timeout = rtos::Kernel::wait_for_u32_forever) + transfer_and_wait(const WordT *tx_buffer, int tx_length, CacheAlignedBuffer &rx_buffer, int rx_length, rtos::Kernel::Clock::duration_u32 timeout = rtos::Kernel::wait_for_u32_forever) { + MBED_ASSERT(rx_length <= static_cast(rx_buffer.capacity())); return transfer_and_wait_internal(tx_buffer, tx_length, rx_buffer.data(), rx_length, timeout); } // Overloads of the above to support passing nullptr - template + template typename std::enable_if::value, int>::type - transfer_and_wait(const std::nullptr_t *tx_buffer, int tx_length, CacheAlignedBuffer &rx_buffer, int rx_length, rtos::Kernel::Clock::duration_u32 timeout = rtos::Kernel::wait_for_u32_forever) + transfer_and_wait(const std::nullptr_t *tx_buffer, int tx_length, CacheAlignedBuffer &rx_buffer, int rx_length, rtos::Kernel::Clock::duration_u32 timeout = rtos::Kernel::wait_for_u32_forever) { + MBED_ASSERT(rx_length <= static_cast(rx_buffer.capacity())); return transfer_and_wait_internal(tx_buffer, tx_length, rx_buffer.data(), rx_length, timeout); } template diff --git a/platform/include/platform/CacheAlignedBuffer.h b/platform/include/platform/CacheAlignedBuffer.h index 6e8e56a66f3..777d5aadb33 100644 --- a/platform/include/platform/CacheAlignedBuffer.h +++ b/platform/include/platform/CacheAlignedBuffer.h @@ -24,47 +24,90 @@ namespace mbed { +namespace detail::cab { + /** + * @brief Calculate the needed capacity for a cache aligned buffer's backing buffer based on the + * needed capacity and element size. + * + * @param neededCapacity Capacity needed for the buffer + * @param elementSize Size of each element + * + * @return Needed backing buffer size + */ + constexpr inline size_t getNeededBackingBufferSize(size_t neededCapacity, size_t elementSize) { +#if __DCACHE_PRESENT + // Allocate enough extra space that we can shift the start of the buffer towards higher addresses to be on a cache line. 
+ // The worst case for this is when the first byte is allocated 1 byte past the start of a cache line, so we + // will need an additional (cache line size - 1) bytes. + // Additionally, since we are going to be invalidating this buffer, we can't allow any other variables to be + // in the same cache lines, or they might get corrupted. + // So, we need to round up the backing buffer size to the nearest multiple of the cache line size. + // The math for rounding up can be found here: + // https://community.st.com/t5/stm32-mcus-products/maintaining-cpu-data-cache-coherence-for-dma-buffers/td-p/95746 + size_t requiredSizeRoundedUp = (neededCapacity * elementSize + __SCB_DCACHE_LINE_SIZE - 1) & ~((__SCB_DCACHE_LINE_SIZE) - 1); + return requiredSizeRoundedUp + __SCB_DCACHE_LINE_SIZE - 1; +#else + // No cache on this platform so don't need any extra space. + return neededCapacity * elementSize; +#endif + } +} /** * @brief CacheAlignedBuffer is used by Mbed in locations where we need a cache-aligned buffer. * - * Cache alignment is desirable in several different situations in embedded programming -- one common + *

<p>Cache alignment is desirable in several different situations in embedded programming -- one common
 * use is when working with DMA or other peripherals which write their results back to main memory.
 * After these peripherals do their work, the data will be correct in main memory, but the CPU cache
- * might also contain a value cached from that memory which is now incorrect.
+ * might also contain a value cached from that memory which is now incorrect.</p>
 *
- * In order to read those results from memory without risk of getting old data from the
+ * <p>In order to read those results from memory without risk of getting old data from the
 * CPU cache, one needs to align the buffer so it takes up an integer number of cache lines,
- * then invalidate the cache lines so that the data gets reread from RAM.
+ * then invalidate the cache lines so that the data gets reread from RAM.</p>
 *
- * CacheAlignedBuffer provides an easy way to allocate the correct amount of space so that
- * a buffer of any size can be made cache-aligned.
+ * <p>%CacheAlignedBuffer provides an easy way to allocate the correct amount of space so that
+ * a buffer of any size can be made cache-aligned. To instantiate a %CacheAlignedBuffer, create one of its
+ * subtypes, #StaticCacheAlignedBuffer or #DynamicCacheAlignedBuffer.</p>
+ *
+ * <h1>Converting Code to use CacheAlignedBuffer</h1>
+ * For code using static arrays, like this: + * \code{.cpp} + * uint8_t rxBuffer[16]; + * spi.transfer_and_wait(nullptr, 0, rxBuffer, 16); + * \endcode + * Use a StaticCacheAlignedBuffer: + * \code{.cpp} + * StaticCacheAlignedBuffer rxBuffer; + * spi.transfer_and_wait(nullptr, 0, rxBuffer, 16); + * \endcode + * For code using dynamic allocation to handle unknown buffer sizes, like: + * \code{.cpp} + * uint16_t * rxBuffer = new uint16_t[bufferSize]; + * spi.transfer_and_wait(nullptr, 0, rxBuffer, bufferSize); + * ... + * delete[] rxBuffer; + * \endcode + * use a DynamicCacheAlignedBuffer: + * \code{.cpp} + * DynamicCacheAlignedBuffer rxBuffer(bufferSize); + * spi.transfer_and_wait(nullptr, 0, rxBuffer, bufferSize); + * \endcode * * @tparam DataT Type of the data to store in the buffer. Note: %CacheAlignedBuffer is not designed for * using class types as DataT, and will not call constructors. - * @tparam BufferSize Buffer size (number of elements) needed by the application for the buffer. */ -template -struct - CacheAlignedBuffer { -private: -#if __DCACHE_PRESENT - // Allocate enough extra space that we can shift the start of the buffer forward to be on a cache line. - // The worst case for this is when the first byte is allocated 1 byte past the start of a cache line, so we - // will need an additional (cache line size - 1) bytes. - // Additionally, since we are going to be invalidating this buffer, we can't allow any other variables to be - // in the same cache lines, or they might get corrupted. - // So, we need to round up the backing buffer size to the nearest multiple of the cache line size. - // The math for rounding up can be found here: - // https://community.st.com/t5/stm32-mcus-products/maintaining-cpu-data-cache-coherence-for-dma-buffers/td-p/95746 - constexpr static size_t requiredSizeRoundedUp = (BufferSize *sizeof(DataT) + __SCB_DCACHE_LINE_SIZE - 1) & ~((__SCB_DCACHE_LINE_SIZE) - 1); - constexpr static size_t backingBufferSizeBytes = requiredSizeRoundedUp + __SCB_DCACHE_LINE_SIZE - 1; -#else - constexpr static size_t backingBufferSizeBytes = BufferSize * sizeof(DataT); -#endif +template +class CacheAlignedBuffer { + +protected: + /// Pointer to the aligned buffer. Must be set in each constructor of each subclass. + DataT * _alignedBufferPtr; + + /// Capacity of the aligned buffer, in terms of number of DataT elements + size_t _alignedBufferCapacity; - uint8_t _backingBuffer[backingBufferSizeBytes]; - DataT *_alignedArrayPtr; + // Protected constructor to block instantiation + CacheAlignedBuffer() = default; /** * Find and return the first location in the given buffer that starts on a cache line. @@ -74,7 +117,7 @@ struct * * @return Pointer to first data item, aligned at the start of a cache line. */ - inline DataT *findCacheLineStart(uint8_t *buffer) + static inline DataT *findCacheLineStart(uint8_t *buffer) { #if __DCACHE_PRESENT // Use integer division to divide the address down to the cache line size, which @@ -96,37 +139,12 @@ struct typedef DataT *iterator; typedef DataT const *const_iterator; - /** - * @brief Construct new cache-aligned buffer. Buffer will be zero-initialized. - */ - CacheAlignedBuffer(): - _backingBuffer{}, - _alignedArrayPtr(findCacheLineStart(_backingBuffer)) - {} - - /** - * @brief Copy from other cache-aligned buffer. Buffer memory will be copied. 
- */ - CacheAlignedBuffer(CacheAlignedBuffer const &other): - _alignedArrayPtr(findCacheLineStart(_backingBuffer)) - { - memcpy(this->_alignedArrayPtr, other._alignedArrayPtr, BufferSize * sizeof(DataT)); - } - - /** - * @brief Assign from other cache-aligned buffer. Buffer memory will be assigned. - */ - CacheAlignedBuffer &operator=(CacheAlignedBuffer const &other) - { - memcpy(this->_alignedArrayPtr, other._alignedArrayPtr, BufferSize * sizeof(DataT)); - } - /** * @brief Get a pointer to the aligned data array inside the buffer */ DataT *data() { - return _alignedArrayPtr; + return _alignedBufferPtr; } /** @@ -134,7 +152,7 @@ struct */ DataT const *data() const { - return _alignedArrayPtr; + return _alignedBufferPtr; } /** @@ -142,7 +160,7 @@ struct */ DataT &operator[](size_t index) { - return _alignedArrayPtr[index]; + return _alignedBufferPtr[index]; } /** @@ -150,7 +168,7 @@ struct */ DataT operator[](size_t index) const { - return _alignedArrayPtr[index]; + return _alignedBufferPtr[index]; } /** @@ -158,7 +176,7 @@ struct */ iterator begin() { - return _alignedArrayPtr; + return _alignedBufferPtr; } /** @@ -166,7 +184,7 @@ struct */ const_iterator begin() const { - return _alignedArrayPtr; + return _alignedBufferPtr; } /** @@ -174,7 +192,7 @@ struct */ iterator end() { - return _alignedArrayPtr + BufferSize; + return _alignedBufferPtr + _alignedBufferCapacity; } /** @@ -182,29 +200,129 @@ struct */ const_iterator end() const { - return _alignedArrayPtr + BufferSize; + return _alignedBufferPtr + _alignedBufferCapacity; } /** - * @return The maximum amount of DataT elements that this buffer can hold + * @return The maximum amount of DataT elements that this buffer can hold. */ constexpr size_t capacity() { - return BufferSize; + return _alignedBufferCapacity; + } +}; + +/** + * @brief CacheAlignedBuffer type designed for static allocation. + * + * Use a StaticCacheAlignedBuffer when you want to create a cache-aligned buffer with a fixed size + * at compile time. %StaticCacheAlignedBuffers can be declared globally, as local variables, or using + * new[] and delete[]. + * + * @tparam DataT Type of the data to store in the buffer. Note: %CacheAlignedBuffer is not designed for + * using class types as DataT, and will not call constructors. + * @tparam BufferSize Buffer size (number of elements) needed by the application for the buffer. + */ +template +class StaticCacheAlignedBuffer : public CacheAlignedBuffer { +private: + + uint8_t _backingBuffer[detail::cab::getNeededBackingBufferSize(BufferSize, sizeof(DataT))]; + +public: + + /** + * @brief Construct new cache-aligned buffer. Buffer will be zero-initialized. + */ + StaticCacheAlignedBuffer(): + _backingBuffer{} + { + this->_alignedBufferPtr = this->findCacheLineStart(_backingBuffer); + this->_alignedBufferCapacity = BufferSize; } /** - * @brief If this MCU has a data cache, this function _invalidates_ the buffer in the data cache. + * @brief Copy from other cache-aligned buffer. Buffer memory will be copied. + */ + StaticCacheAlignedBuffer(StaticCacheAlignedBuffer const &other) + { + this->_alignedBufferPtr = this->findCacheLineStart(_backingBuffer); + memcpy(this->_alignedBufferPtr, other._alignedBufferPtr, BufferSize * sizeof(DataT)); + this->_alignedBufferCapacity = BufferSize; + } + + /** + * @brief Assign from other cache-aligned buffer. Buffer memory will be assigned. * - * Invalidation means that the next time code access the buffer data, it is guaranteed to be fetched from - * main memory rather than from the CPU cache. 
If there are changes made to the data which have not been - * flushed back to main memory, those changes will be lost! + * Only a buffer with the same data type and size can be assigned. */ - void invalidate() + StaticCacheAlignedBuffer &operator=(StaticCacheAlignedBuffer const &other) { -#if __DCACHE_PRESENT - SCB_InvalidateDCache_by_Addr(_alignedArrayPtr, BufferSize * sizeof(DataT)); -#endif + memmove(this->_alignedBufferPtr, other._alignedBufferPtr, BufferSize * sizeof(DataT)); + } +}; + +/** + * @brief CacheAlignedBuffer type which allocates its backing buffer on the heap. + * + * Use a DynamicCacheAlignedBuffer when you want to create a cache-aligned buffer with a size + * known only at runtime. When constructed, %DynamicCacheAlignedBuffers allocate backing memory off the + * heap for the provided number of elements. The memory will be released when the buffer object is destroyed. + * + * @tparam DataT Type of the data to store in the buffer. Note: %CacheAlignedBuffer is not designed for + * using class types as DataT, and will not call constructors. + */ +template +class DynamicCacheAlignedBuffer : public CacheAlignedBuffer { + uint8_t * _heapMem; +public: + /** + * @brief Construct new cache-aligned buffer. Buffer will be zero-initialized and allocated from the heap. + * + * @param capacity Number of elements the buffer shall hold + */ + explicit DynamicCacheAlignedBuffer(size_t capacity): + _heapMem(new uint8_t[detail::cab::getNeededBackingBufferSize(capacity, sizeof(DataT))]()) + { + this->_alignedBufferPtr = this->findCacheLineStart(_heapMem); + this->_alignedBufferCapacity = capacity; + } + + /** + * @brief Copy from other cache-aligned buffer. A new backing buffer will be allocated on the heap and + * its data will be copied from the other buffer. + */ + DynamicCacheAlignedBuffer(DynamicCacheAlignedBuffer const &other): + _heapMem(new uint8_t[detail::cab::getNeededBackingBufferSize(other._alignedBufferCapacity, sizeof(DataT))]) + { + this->_alignedBufferCapacity = other._alignedBufferCapacity; + this->_alignedBufferPtr = this->findCacheLineStart(_heapMem); + memcpy(this->_alignedBufferPtr, other._alignedBufferPtr, this->_alignedBufferCapacity * sizeof(DataT)); + } + + // Destructor + ~DynamicCacheAlignedBuffer() + { + delete[] this->_heapMem; + } + + /** + * @brief Assign from other cache-aligned buffer with the same type. A new buffer will be allocated + * of the correct size. 
+ */ + DynamicCacheAlignedBuffer &operator=(DynamicCacheAlignedBuffer const &other) + { + // Check for self assignment + if(&other == this) + { + return *this; + } + + delete[] _heapMem; + _heapMem = new uint8_t[detail::cab::getNeededBackingBufferSize(other._alignedBufferCapacity, sizeof(DataT))]; + this->_alignedBufferPtr = this->findCacheLineStart(_heapMem); + + memcpy(this->_alignedBufferPtr, other._alignedBufferPtr, this->_alignedBufferCapacity * sizeof(DataT)); } }; diff --git a/storage/blockdevice/COMPONENT_SD/include/SD/SDBlockDevice.h b/storage/blockdevice/COMPONENT_SD/include/SD/SDBlockDevice.h index 246fed59538..e5aafcf609f 100644 --- a/storage/blockdevice/COMPONENT_SD/include/SD/SDBlockDevice.h +++ b/storage/blockdevice/COMPONENT_SD/include/SD/SDBlockDevice.h @@ -317,7 +317,7 @@ class SDBlockDevice : public mbed::BlockDevice { #if DEVICE_SPI_ASYNCH bool _async_spi_enabled = false; - mbed::CacheAlignedBuffer _async_data_buffer; + mbed::StaticCacheAlignedBuffer _async_data_buffer; #endif }; diff --git a/storage/blockdevice/COMPONENT_SD/source/SDBlockDevice.cpp b/storage/blockdevice/COMPONENT_SD/source/SDBlockDevice.cpp index 21d5bb8657f..73569fb6fbf 100644 --- a/storage/blockdevice/COMPONENT_SD/source/SDBlockDevice.cpp +++ b/storage/blockdevice/COMPONENT_SD/source/SDBlockDevice.cpp @@ -1078,7 +1078,7 @@ bd_size_t SDBlockDevice::_sd_sectors() return 0; } #ifdef DEVICE_SPI_ASYNCH - CacheAlignedBuffer csd_buffer; + StaticCacheAlignedBuffer csd_buffer; uint8_t *csd = csd_buffer.data(); #else uint8_t csd[16]; diff --git a/tools/python/run_python_tests.sh b/tools/python/run_python_tests.sh index b398106635e..208bd8b6cd7 100755 --- a/tools/python/run_python_tests.sh +++ b/tools/python/run_python_tests.sh @@ -11,7 +11,7 @@ set -e -PYTHON=python3 +PYTHON=python # Comma separated list of directories to exclude from coverage COVERAGE_EXCLUDES='--omit=python_tests/*' diff --git a/tools/python/scancode_evaluate/scancode_evaluate.py b/tools/python/scancode_evaluate/scancode_evaluate.py index 3cf24457f9d..3e1ebeb91df 100644 --- a/tools/python/scancode_evaluate/scancode_evaluate.py +++ b/tools/python/scancode_evaluate/scancode_evaluate.py @@ -72,14 +72,30 @@ def has_spdx_text_in_scancode_output(scancode_output_data_file_licenses): ) +SPDX_LICENSE_REGEX = re.compile(r"SPDX-License-Identifier: *(\S+)") + + def has_spdx_text_in_analysed_file(scanned_file_content): """Returns true if the file analysed by ScanCode contains SPDX identifier.""" - return bool(re.findall("SPDX-License-Identifier:?", scanned_file_content)) + return bool(SPDX_LICENSE_REGEX.findall(scanned_file_content)) - -def has_binary_license(scanned_file_content): - """Returns true if the file analysed by ScanCode contains a Permissive Binary License.""" - return bool(re.findall("Permissive Binary License", scanned_file_content)) +def has_exempted_spdx_identifier(scanned_file_content): + """ + Returns true if the file analysed by scancode contains an exempted SPDX identifier. 
+ (this is used for licenses like Nordic BSD which are not technically permissive licenses according + to SPDX but which are still OK for us) + """ + spdx_find_result = SPDX_LICENSE_REGEX.findall(scanned_file_content) + if len(spdx_find_result) == 1: + if spdx_find_result[0] == "LicenseRef-Nordic-5-Clause": + return True + if spdx_find_result[0] == "LicenseRef-PBL": + return True + else: + return False + else: + # Either 0 or multiple matches, ignore + return False def get_file_text(scancode_output_data_file): @@ -94,6 +110,7 @@ def get_file_text(scancode_output_data_file): except UnicodeDecodeError: userlog.warning("Unable to decode file text in: %s" % file_path) # Ignore files that cannot be decoded + return None def license_check(scancode_output_path): @@ -135,7 +152,10 @@ def license_check(scancode_output_path): if not has_permissive_text_in_scancode_output(scancode_output_data_file['licenses']): scanned_file_content = get_file_text(scancode_output_data_file) - if not (scanned_file_content and has_binary_license(scanned_file_content)): + if (scanned_file_content is None + or has_exempted_spdx_identifier(scanned_file_content)): + continue + else: scancode_output_data_file['fail_reason'] = MISSING_PERMISSIVE_LICENSE_TEXT license_offenders.append(scancode_output_data_file) From 3ec38f6cf7d6c146c27c5518c1811bcb2fa0f660 Mon Sep 17 00:00:00 2001 From: Jamie Smith Date: Tue, 12 Dec 2023 23:59:01 -0800 Subject: [PATCH 7/8] Formatting, docs, revert accidental change --- .../include/platform/CacheAlignedBuffer.h | 69 +++++++++++-------- tools/python/run_python_tests.sh | 2 +- 2 files changed, 40 insertions(+), 31 deletions(-) diff --git a/platform/include/platform/CacheAlignedBuffer.h b/platform/include/platform/CacheAlignedBuffer.h index 777d5aadb33..72477d14cff 100644 --- a/platform/include/platform/CacheAlignedBuffer.h +++ b/platform/include/platform/CacheAlignedBuffer.h @@ -25,34 +25,42 @@ namespace mbed { namespace detail::cab { - /** - * @brief Calculate the needed capacity for a cache aligned buffer's backing buffer based on the - * needed capacity and element size. - * - * @param neededCapacity Capacity needed for the buffer - * @param elementSize Size of each element - * - * @return Needed backing buffer size - */ - constexpr inline size_t getNeededBackingBufferSize(size_t neededCapacity, size_t elementSize) { +/** + * @brief Calculate the needed capacity for a cache aligned buffer's backing buffer based on the + * needed capacity and element size. + * + * @param neededCapacity Capacity needed for the buffer + * @param elementSize Size of each element + * + * @return Needed backing buffer size + */ +constexpr inline size_t getNeededBackingBufferSize(size_t neededCapacity, size_t elementSize) +{ #if __DCACHE_PRESENT - // Allocate enough extra space that we can shift the start of the buffer towards higher addresses to be on a cache line. - // The worst case for this is when the first byte is allocated 1 byte past the start of a cache line, so we - // will need an additional (cache line size - 1) bytes. - // Additionally, since we are going to be invalidating this buffer, we can't allow any other variables to be - // in the same cache lines, or they might get corrupted. - // So, we need to round up the backing buffer size to the nearest multiple of the cache line size. 
- // The math for rounding up can be found here: - // https://community.st.com/t5/stm32-mcus-products/maintaining-cpu-data-cache-coherence-for-dma-buffers/td-p/95746 - size_t requiredSizeRoundedUp = (neededCapacity * elementSize + __SCB_DCACHE_LINE_SIZE - 1) & ~((__SCB_DCACHE_LINE_SIZE) - 1); - return requiredSizeRoundedUp + __SCB_DCACHE_LINE_SIZE - 1; + // Allocate enough extra space that we can shift the start of the buffer towards higher addresses to be on a cache line. + // The worst case for this is when the first byte is allocated 1 byte past the start of a cache line, so we + // will need an additional (cache line size - 1) bytes. + // Additionally, since we are going to be invalidating this buffer, we can't allow any other variables to be + // in the same cache lines, or they might get corrupted. + // So, we need to round up the backing buffer size to the nearest multiple of the cache line size. + // The math for rounding up can be found here: + // https://community.st.com/t5/stm32-mcus-products/maintaining-cpu-data-cache-coherence-for-dma-buffers/td-p/95746 + size_t requiredSizeRoundedUp = (neededCapacity * elementSize + __SCB_DCACHE_LINE_SIZE - 1) & ~((__SCB_DCACHE_LINE_SIZE) - 1); + return requiredSizeRoundedUp + __SCB_DCACHE_LINE_SIZE - 1; #else - // No cache on this platform so don't need any extra space. - return neededCapacity * elementSize; + // No cache on this platform so don't need any extra space. + return neededCapacity * elementSize; #endif - } +} } +/** \addtogroup platform-public-api */ +/** @{*/ +/** + * \defgroup platform_CacheAlignedBuffer CacheAlignedBuffer class + * @{ + */ + /** * @brief CacheAlignedBuffer is used by Mbed in locations where we need a cache-aligned buffer. * @@ -67,7 +75,7 @@ namespace detail::cab { * *

<p>%CacheAlignedBuffer provides an easy way to allocate the correct amount of space so that
 * a buffer of any size can be made cache-aligned. To instantiate a %CacheAlignedBuffer, create one of its
- * subtypes, #StaticCacheAlignedBuffer or #DynamicCacheAlignedBuffer.</p>
+ * subtypes, StaticCacheAlignedBuffer or DynamicCacheAlignedBuffer.</p>
 *
 * <h1>Converting Code to use CacheAlignedBuffer</h1>
* For code using static arrays, like this: @@ -101,7 +109,7 @@ class CacheAlignedBuffer { protected: /// Pointer to the aligned buffer. Must be set in each constructor of each subclass. - DataT * _alignedBufferPtr; + DataT *_alignedBufferPtr; /// Capacity of the aligned buffer, in terms of number of DataT elements size_t _alignedBufferCapacity; @@ -274,7 +282,7 @@ class StaticCacheAlignedBuffer : public CacheAlignedBuffer { */ template class DynamicCacheAlignedBuffer : public CacheAlignedBuffer { - uint8_t * _heapMem; + uint8_t *_heapMem; public: /** * @brief Construct new cache-aligned buffer. Buffer will be zero-initialized and allocated from the heap. @@ -282,7 +290,7 @@ class DynamicCacheAlignedBuffer : public CacheAlignedBuffer { * @param capacity Number of elements the buffer shall hold */ explicit DynamicCacheAlignedBuffer(size_t capacity): - _heapMem(new uint8_t[detail::cab::getNeededBackingBufferSize(capacity, sizeof(DataT))]()) + _heapMem(new uint8_t[detail::cab::getNeededBackingBufferSize(capacity, sizeof(DataT))]()) { this->_alignedBufferPtr = this->findCacheLineStart(_heapMem); this->_alignedBufferCapacity = capacity; @@ -293,7 +301,7 @@ class DynamicCacheAlignedBuffer : public CacheAlignedBuffer { * its data will be copied from the other buffer. */ DynamicCacheAlignedBuffer(DynamicCacheAlignedBuffer const &other): - _heapMem(new uint8_t[detail::cab::getNeededBackingBufferSize(other._alignedBufferCapacity, sizeof(DataT))]) + _heapMem(new uint8_t[detail::cab::getNeededBackingBufferSize(other._alignedBufferCapacity, sizeof(DataT))]) { this->_alignedBufferCapacity = other._alignedBufferCapacity; this->_alignedBufferPtr = this->findCacheLineStart(_heapMem); @@ -313,8 +321,7 @@ class DynamicCacheAlignedBuffer : public CacheAlignedBuffer { DynamicCacheAlignedBuffer &operator=(DynamicCacheAlignedBuffer const &other) { // Check for self assignment - if(&other == this) - { + if (&other == this) { return *this; } @@ -326,6 +333,8 @@ class DynamicCacheAlignedBuffer : public CacheAlignedBuffer { } }; +/// @} + } #endif //MBED_CACHEALIGNEDBUFFER_H diff --git a/tools/python/run_python_tests.sh b/tools/python/run_python_tests.sh index 208bd8b6cd7..b398106635e 100755 --- a/tools/python/run_python_tests.sh +++ b/tools/python/run_python_tests.sh @@ -11,7 +11,7 @@ set -e -PYTHON=python +PYTHON=python3 # Comma separated list of directories to exclude from coverage COVERAGE_EXCLUDES='--omit=python_tests/*' From 9195ed83db38ef7f1f0ba38844d460568b409941 Mon Sep 17 00:00:00 2001 From: Jamie Smith Date: Wed, 13 Dec 2023 00:06:56 -0800 Subject: [PATCH 8/8] Update code blocks to pass spell checker --- platform/include/platform/CacheAlignedBuffer.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/platform/include/platform/CacheAlignedBuffer.h b/platform/include/platform/CacheAlignedBuffer.h index 72477d14cff..50b418b7de8 100644 --- a/platform/include/platform/CacheAlignedBuffer.h +++ b/platform/include/platform/CacheAlignedBuffer.h @@ -79,27 +79,27 @@ constexpr inline size_t getNeededBackingBufferSize(size_t neededCapacity, size_t * *

<h1>Converting Code to use CacheAlignedBuffer</h1>
 * For code using static arrays, like this:
- * \code{.cpp}
+ * @code
 * uint8_t rxBuffer[16];
 * spi.transfer_and_wait(nullptr, 0, rxBuffer, 16);
- * \endcode
+ * @endcode
 * Use a StaticCacheAlignedBuffer:
- * \code{.cpp}
+ * @code
 * StaticCacheAlignedBuffer<uint8_t, 16> rxBuffer;
 * spi.transfer_and_wait(nullptr, 0, rxBuffer, 16);
- * \endcode
+ * @endcode
 * For code using dynamic allocation to handle unknown buffer sizes, like:
- * \code{.cpp}
+ * @code
 * uint16_t * rxBuffer = new uint16_t[bufferSize];
 * spi.transfer_and_wait(nullptr, 0, rxBuffer, bufferSize);
 * ...
 * delete[] rxBuffer;
- * \endcode
+ * @endcode
 * use a DynamicCacheAlignedBuffer:
- * \code{.cpp}
+ * @code
 * DynamicCacheAlignedBuffer<uint16_t> rxBuffer(bufferSize);
 * spi.transfer_and_wait(nullptr, 0, rxBuffer, bufferSize);
- * \endcode
+ * @endcode
 *
 * @tparam DataT Type of the data to store in the buffer. Note: %CacheAlignedBuffer is not designed for
 * using class types as DataT, and will not call constructors.
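
Putting the series together, here is a minimal end-to-end sketch of the reworked asynchronous API
(not taken from the patches themselves; the pin names and the two-byte transfer are placeholder
choices for a board with DEVICE_SPI_ASYNCH enabled):

    #include "mbed.h"

    int main()
    {
        SPI device(PA_7, PA_6, PA_5); // MOSI, MISO, SCLK -- placeholder pins
        device.format(8, 0);

        uint8_t command[2] = {0x0A, 0x0B};

        // The Rx buffer is a CacheAlignedBuffer subtype, so the driver can invalidate its
        // cache lines after DMA completes without corrupting neighboring variables.
        StaticCacheAlignedBuffer<uint8_t, 2> response;

        int result = device.transfer_and_wait(command, sizeof(command), response, sizeof(command));
        if (result == 0) {
            printf("received 0x%02X 0x%02X\n", response[0], response[1]);
        }
    }

Because transfer_and_wait() takes the buffer by reference through its CacheAlignedBuffer<uint8_t>
base, the same call site works unchanged with a DynamicCacheAlignedBuffer<uint8_t> when the size is
only known at runtime.
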