-
Notifications
You must be signed in to change notification settings - Fork 272
/
Copy pathspi.cpp
673 lines (576 loc) · 26.1 KB
/
spi.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
#ifndef KERNEL_MODULE
#include <stdio.h> // printf, stderr
#include <syslog.h> // syslog
#include <fcntl.h> // open, O_RDWR, O_SYNC
#include <sys/mman.h> // mmap, munmap
#include <pthread.h> // pthread_create
#include <bcm_host.h> // bcm_host_get_peripheral_address, bcm_host_get_peripheral_size, bcm_host_get_sdram_address
#endif
#include "config.h"
#include "spi.h"
#include "util.h"
#include "dma.h"
#include "mailbox.h"
#include "mem_alloc.h"
// Uncomment this to print out all bytes sent to the SPI bus
// #define DEBUG_SPI_BUS_WRITES
#ifdef DEBUG_SPI_BUS_WRITES
#define DEBUG_PRINT_WRITTEN_BYTE(byte) do { \
printf("%02X", byte); \
if ((writeCounter & 3) == 0) printf("\n"); \
} while(0)
#else
#define DEBUG_PRINT_WRITTEN_BYTE(byte) ((void)0)
#endif
#ifdef CHIP_SELECT_LINE_NEEDS_REFRESHING_EACH_32BITS_WRITTEN
void ChipSelectHigh();
#define TOGGLE_CHIP_SELECT_LINE() if ((++writeCounter & 3) == 0) { ChipSelectHigh(); }
#else
#define TOGGLE_CHIP_SELECT_LINE() ((void)0)
#endif
static uint32_t writeCounter = 0;
#define WRITE_FIFO(word) do { \
uint8_t w = (word); \
spi->fifo = w; \
TOGGLE_CHIP_SELECT_LINE(); \
DEBUG_PRINT_WRITTEN_BYTE(w); \
} while(0)
int mem_fd = -1;
volatile void *bcm2835 = 0;
volatile GPIORegisterFile *gpio = 0;
volatile SPIRegisterFile *spi = 0;
// Points to the system timer register. N.B. spec sheet says this is two low and high parts, in an 32-bit aligned (but not 64-bit aligned) address. Profiling shows
// that Pi 3 Model B does allow reading this as a u64 load, and even when unaligned, it is around 30% faster to do so compared to loading in parts "lo | (hi << 32)".
volatile uint64_t *systemTimerRegister = 0;
void DumpSPICS(uint32_t reg)
{
PRINT_FLAG(BCM2835_SPI0_CS_CS);
PRINT_FLAG(BCM2835_SPI0_CS_CPHA);
PRINT_FLAG(BCM2835_SPI0_CS_CPOL);
PRINT_FLAG(BCM2835_SPI0_CS_CLEAR_TX);
PRINT_FLAG(BCM2835_SPI0_CS_CLEAR_RX);
PRINT_FLAG(BCM2835_SPI0_CS_TA);
PRINT_FLAG(BCM2835_SPI0_CS_DMAEN);
PRINT_FLAG(BCM2835_SPI0_CS_INTD);
PRINT_FLAG(BCM2835_SPI0_CS_INTR);
PRINT_FLAG(BCM2835_SPI0_CS_ADCS);
PRINT_FLAG(BCM2835_SPI0_CS_DONE);
PRINT_FLAG(BCM2835_SPI0_CS_RXD);
PRINT_FLAG(BCM2835_SPI0_CS_TXD);
PRINT_FLAG(BCM2835_SPI0_CS_RXR);
PRINT_FLAG(BCM2835_SPI0_CS_RXF);
printf("SPI0 DLEN: %u\n", spi->dlen);
printf("SPI0 CE0 register: %d\n", GET_GPIO(GPIO_SPI0_CE0) ? 1 : 0);
}
#ifdef RUN_WITH_REALTIME_THREAD_PRIORITY
#include <pthread.h>
#include <sched.h>
void SetRealtimeThreadPriority()
{
sched_param params;
params.sched_priority = sched_get_priority_max(SCHED_FIFO);
int failed = pthread_setschedparam(pthread_self(), SCHED_FIFO, ¶ms);
if (failed) FATAL_ERROR("pthread_setschedparam() failed!");
int policy = 0;
failed = pthread_getschedparam(pthread_self(), &policy, ¶ms);
if (failed) FATAL_ERROR("pthread_getschedparam() failed!");
if (policy != SCHED_FIFO) FATAL_ERROR("Failed to set realtime thread policy!");
printf("Set fbcp-ili9341 thread scheduling priority to maximum (%d)\n", sched_get_priority_max(SCHED_FIFO));
}
#endif
// Errata to BCM2835 behavior: documentation states that the SPI0 DLEN register is only used for DMA. However, even when DMA is not being utilized, setting it from
// a value != 0 or 1 gets rid of an excess idle clock cycle that is present when transmitting each byte. (by default in Polled SPI Mode each 8 bits transfer in 9 clocks)
// With DLEN=2 each byte is clocked to the bus in 8 cycles, observed to improve max throughput from 56.8mbps to 63.3mbps (+11.4%, quite close to the theoretical +12.5%)
// https://www.raspberrypi.org/forums/viewtopic.php?f=44&t=181154
#define UNLOCK_FAST_8_CLOCKS_SPI() (spi->dlen = 2)
#ifdef ALL_TASKS_SHOULD_DMA
bool previousTaskWasSPI = true;
#endif
#ifdef SPI_3WIRE_PROTOCOL
uint32_t NumBytesNeededFor32BitSPITask(uint32_t byteSizeFor8BitTask)
{
return byteSizeFor8BitTask * 2 + 4; // 16bit -> 32bit expansion, plus 4 bytes for command word
}
uint32_t NumBytesNeededFor9BitSPITask(uint32_t byteSizeFor8BitTask)
{
uint32_t numOutBits = (byteSizeFor8BitTask + 1) * 9;
// The number of bits we send out in a command must be a multiple of 9 bits, because each byte is 1 data/command bit plus 8 payload bits
// But the number of bits sent out in a command must also be a multiple of 8 bits, because BCM2835 SPI peripheral only deals with sending out full bytes.
// Therefore the bits written out must be a multiple of lcm(9*8)=72bits.
numOutBits = ((numOutBits + 71) / 72) * 72;
uint32_t numOutBytes = numOutBits >> 3;
return numOutBytes;
}
// N.B. BCM2835 hardware always clocks bytes out most significant bit (MSB) first, so when interleaving, the command bit needs to start out in the
// highest byte of the outgoing buffer.
void Interleave8BitSPITaskTo9Bit(SPITask *task)
{
const uint32_t size8BitTask = task->size - task->sizeExpandedTaskWithPadding;
// 9-bit SPI task lives right at the end of the 8-bit task
uint8_t *dst = task->data + size8BitTask;
// Pre-clear the 9*8=72 bit tail end of the memory to all zeroes to avoid having to pad source data to multiples of 9. (plus padding bytes, just to be safe)
memset(dst + task->sizeExpandedTaskWithPadding - 9 - SPI_9BIT_TASK_PADDING_BYTES, 0, 9 + SPI_9BIT_TASK_PADDING_BYTES);
// Fill first command byte xxxxxxxx -> 0xxxxxxx x: (low 0 bit to indicate a command byte)
dst[0] = task->cmd >> 1;
dst[1] = task->cmd << 7;
int dstByte = 1;
int dstBitsUsed = 1;
int src = 0;
// Command bit above produced one byte. If there are at least 7 bytes in the data set, we can complete a set of 8 transferred bytes. Fast track
// that:
if (size8BitTask >= 7)
{
dst[1] |= 0x40 | (task->data[0] >> 2);
dst[2] = 0x20 | (task->data[0] << 6) | (task->data[1] >> 3);
dst[3] = 0x10 | (task->data[1] << 5) | (task->data[2] >> 4);
dst[4] = 0x08 | (task->data[2] << 4) | (task->data[3] >> 5);
dst[5] = 0x04 | (task->data[3] << 3) | (task->data[4] >> 6);
dst[6] = 0x02 | (task->data[4] << 2) | (task->data[5] >> 7);
dst[7] = 0x01 | (task->data[5] << 1);
dst[8] = (task->data[6] );
dstByte = 9;
dstBitsUsed = 0;
src = 7;
// More fast tracking: As long as we have multiples of 8 bytes left, fast fill them in
while(src <= size8BitTask - 8)
{
uint8_t *d = dst + dstByte;
dstByte += 9;
const uint8_t *s = task->data + src;
src += 8;
d[0] = 0x80 | (s[0] >> 1);
d[1] = 0x40 | (s[0] << 7) | (s[1] >> 2);
d[2] = 0x20 | (s[1] << 6) | (s[2] >> 3);
d[3] = 0x10 | (s[2] << 5) | (s[3] >> 4);
d[4] = 0x08 | (s[3] << 4) | (s[4] >> 5);
d[5] = 0x04 | (s[4] << 3) | (s[5] >> 6);
d[6] = 0x02 | (s[5] << 2) | (s[6] >> 7);
d[7] = 0x01 | (s[6] << 1);
d[8] = (s[7] );
}
// Pre-clear the next byte to be written - the slow loop below assumes it is continuing a middle of byte sequence
// N.B. This write could happen to memory that is not part of the task, so memory allocation of the 9-bit task needs to allocate one byte of padding
dst[dstByte] = 0;
}
// Fill tail data bytes, slow path
while(src < size8BitTask)
{
uint8_t data = task->data[src++];
// High 1 bit to indicate a data byte
dst[dstByte] |= 1 << (7 - dstBitsUsed);
++dstBitsUsed;
if (dstBitsUsed == 8) // Written data bit completes a full byte?
{
++dstByte; // Advance to next byte
dstBitsUsed = 0;
// Now we are aligned, so can write the data byte directly
dst[dstByte++] = data;
dst[dstByte] = 0; // Clear old contents of the next byte to write
}
else
{
// 8 data bits
dst[dstByte++] |= data >> dstBitsUsed;
// This is the first write to the next byte, that should occur without ORring to clear old data in memory
// N.B. This write could happen to memory that is not part of the task, so memory allocation of the 9-bit task needs to allocate one byte of padding
dst[dstByte] = data << (8 - dstBitsUsed);
}
}
#if 0 // Enable to debug correctness:
#define BYTE_TO_BINARY_PATTERN "%c%c%c%c%c%c%c%c"
#define BYTE_TO_BINARY(byte) \
(byte & 0x80 ? '1' : '0'), \
(byte & 0x40 ? '1' : '0'), \
(byte & 0x20 ? '1' : '0'), \
(byte & 0x10 ? '1' : '0'), \
(byte & 0x08 ? '1' : '0'), \
(byte & 0x04 ? '1' : '0'), \
(byte & 0x02 ? '1' : '0'), \
(byte & 0x01 ? '1' : '0')
printf("Interleaving result: 8-bit task of size %d bytes became %d bytes:\n", task->size - task->sizeExpandedTaskWithPadding, task->sizeExpandedTaskWithPadding - SPI_9BIT_TASK_PADDING_BYTES);
printf("8-bit c" BYTE_TO_BINARY_PATTERN, BYTE_TO_BINARY(task->cmd));
for(int i = 0; i < task->size - task->sizeExpandedTaskWithPadding; ++i)
printf("d" BYTE_TO_BINARY_PATTERN, BYTE_TO_BINARY(task->data[i]));
printf("\n9-bit ");
for(int i = 0; i < task->sizeExpandedTaskWithPadding - SPI_9BIT_TASK_PADDING_BYTES; ++i)
printf(BYTE_TO_BINARY_PATTERN, BYTE_TO_BINARY(dst[i]));
printf("\n\n");
#endif
}
void Interleave16BitSPITaskTo32Bit(SPITask *task)
{
const uint32_t size8BitTask = task->size - task->sizeExpandedTaskWithPadding;
// 32-bit SPI task lives right at the end of the 16-bit task
uint32_t *dst = (uint32_t *)(task->data + size8BitTask);
*dst++ = task->cmd;
const uint32_t taskSizeU16 = size8BitTask >> 1;
uint16_t *src = (uint16_t*)task->data;
for(uint32_t i = 0; i < taskSizeU16; ++i)
dst[i] = 0x1500 | (src[i] << 16);
}
#endif // ~SPI_3WIRE_PROTOCOL
void WaitForPolledSPITransferToFinish()
{
uint32_t cs;
while (!(((cs = spi->cs) ^ BCM2835_SPI0_CS_TA) & (BCM2835_SPI0_CS_DONE | BCM2835_SPI0_CS_TA))) // While TA=1 and DONE=0
if ((cs & (BCM2835_SPI0_CS_RXR | BCM2835_SPI0_CS_RXF)))
spi->cs = BCM2835_SPI0_CS_CLEAR_RX | BCM2835_SPI0_CS_TA | DISPLAY_SPI_DRIVE_SETTINGS;
if ((cs & BCM2835_SPI0_CS_RXD)) spi->cs = BCM2835_SPI0_CS_CLEAR_RX | BCM2835_SPI0_CS_TA | DISPLAY_SPI_DRIVE_SETTINGS;
}
#ifdef ALL_TASKS_SHOULD_DMA
#ifndef USE_DMA_TRANSFERS
#error When building with #define ALL_TASKS_SHOULD_DMA enabled, -DUSE_DMA_TRANSFERS=ON should be set in CMake command line!
#endif
// Synchonously performs a single SPI command byte + N data bytes transfer on the calling thread. Call in between a BEGIN_SPI_COMMUNICATION() and END_SPI_COMMUNICATION() pair.
void RunSPITask(SPITask *task)
{
uint32_t cs;
uint8_t *tStart = task->PayloadStart();
uint8_t *tEnd = task->PayloadEnd();
const uint32_t payloadSize = tEnd - tStart;
uint8_t *tPrefillEnd = tStart + MIN(15, payloadSize);
#define TASK_SIZE_TO_USE_DMA 4
// Do a DMA transfer if this task is suitable in size for DMA to handle
if (payloadSize >= TASK_SIZE_TO_USE_DMA && (task->cmd == DISPLAY_WRITE_PIXELS || task->cmd == DISPLAY_SET_CURSOR_X || task->cmd == DISPLAY_SET_CURSOR_Y))
{
if (previousTaskWasSPI)
WaitForPolledSPITransferToFinish();
// printf("DMA cmd=0x%x, data=%d bytes\n", task->cmd, task->PayloadSize());
SPIDMATransfer(task);
previousTaskWasSPI = false;
}
else
{
if (!previousTaskWasSPI)
{
WaitForDMAFinished();
spi->cs = BCM2835_SPI0_CS_TA | BCM2835_SPI0_CS_CLEAR_TX | DISPLAY_SPI_DRIVE_SETTINGS;
// After having done a DMA transfer, the SPI0 DLEN register has reset to zero, so restore it to fast mode.
UNLOCK_FAST_8_CLOCKS_SPI();
}
else
WaitForPolledSPITransferToFinish();
// printf("SPI cmd=0x%x, data=%d bytes\n", task->cmd, task->PayloadSize());
// Send the command word if display is 4-wire (3-wire displays can omit this, commands are interleaved in the data payload stream above)
#ifndef SPI_3WIRE_PROTOCOL
CLEAR_GPIO(GPIO_TFT_DATA_CONTROL);
#ifdef DISPLAY_SPI_BUS_IS_16BITS_WIDE
// On e.g. the ILI9486, all commands are 16-bit, so need to be clocked in in two bytes. The MSB byte is always zero though in all the defined commands.
WRITE_FIFO(0x00);
#endif
WRITE_FIFO(task->cmd);
#ifdef DISPLAY_SPI_BUS_IS_16BITS_WIDE
while(!(spi->cs & (BCM2835_SPI0_CS_DONE))) /*nop*/;
spi->fifo;
spi->fifo;
#else
while(!(spi->cs & (BCM2835_SPI0_CS_RXD|BCM2835_SPI0_CS_DONE))) /*nop*/;
#endif
SET_GPIO(GPIO_TFT_DATA_CONTROL);
#endif
// Send the data payload:
while(tStart < tPrefillEnd) WRITE_FIFO(*tStart++);
while(tStart < tEnd)
{
cs = spi->cs;
if ((cs & BCM2835_SPI0_CS_TXD)) WRITE_FIFO(*tStart++);
// TODO: else asm volatile("yield");
if ((cs & (BCM2835_SPI0_CS_RXR|BCM2835_SPI0_CS_RXF))) spi->cs = BCM2835_SPI0_CS_CLEAR_RX | BCM2835_SPI0_CS_TA | DISPLAY_SPI_DRIVE_SETTINGS;
}
previousTaskWasSPI = true;
}
}
#else
void RunSPITask(SPITask *task)
{
WaitForPolledSPITransferToFinish();
// The Adafruit 1.65" 240x240 ST7789 based display is unique compared to others that it does want to see the Chip Select line go
// low and high to start a new command. For that display we let hardware SPI toggle the CS line, and actually run TA<-0 and TA<-1
// transitions to let the CS line live. For most other displays, we just set CS line always enabled for the display throughout fbcp-ili9341 lifetime,
// which is a tiny bit faster.
#ifdef DISPLAY_NEEDS_CHIP_SELECT_SIGNAL
BEGIN_SPI_COMMUNICATION();
#endif
uint8_t *tStart = task->PayloadStart();
uint8_t *tEnd = task->PayloadEnd();
const uint32_t payloadSize = tEnd - tStart;
uint8_t *tPrefillEnd = tStart + MIN(15, payloadSize);
// Send the command word if display is 4-wire (3-wire displays can omit this, commands are interleaved in the data payload stream above)
#ifndef SPI_3WIRE_PROTOCOL
// An SPI transfer to the display always starts with one control (command) byte, followed by N data bytes.
CLEAR_GPIO(GPIO_TFT_DATA_CONTROL);
#ifdef DISPLAY_SPI_BUS_IS_16BITS_WIDE
// On e.g. the ILI9486, all commands are 16-bit, so need to be clocked in in two bytes. The MSB byte is always zero though in all the defined commands.
WRITE_FIFO(0x00);
#endif
WRITE_FIFO(task->cmd);
#ifdef DISPLAY_SPI_BUS_IS_16BITS_WIDE
while(!(spi->cs & (BCM2835_SPI0_CS_DONE))) /*nop*/;
spi->fifo;
spi->fifo;
#else
while(!(spi->cs & (BCM2835_SPI0_CS_RXD|BCM2835_SPI0_CS_DONE))) /*nop*/;
#endif
SET_GPIO(GPIO_TFT_DATA_CONTROL);
#endif // ~!SPI_3WIRE_PROTOCOL
// For small transfers, using DMA is not worth it, but pushing through with polled SPI gives better bandwidth.
// For larger transfers though that are more than this amount of bytes, using DMA is faster.
// This cutoff number was experimentally tested to find where Polled SPI and DMA are as fast.
#define DMA_IS_FASTER_THAN_POLLED_SPI 140
// Do a DMA transfer if this task is suitable in size for DMA to handle
#ifdef USE_DMA_TRANSFERS
if (tEnd - tStart > DMA_IS_FASTER_THAN_POLLED_SPI)
{
SPIDMATransfer(task);
// After having done a DMA transfer, the SPI0 DLEN register has reset to zero, so restore it to fast mode.
UNLOCK_FAST_8_CLOCKS_SPI();
}
else
#endif
{
while(tStart < tPrefillEnd) WRITE_FIFO(*tStart++);
while(tStart < tEnd)
{
uint32_t cs = spi->cs;
if ((cs & BCM2835_SPI0_CS_TXD)) WRITE_FIFO(*tStart++);
// TODO: else asm volatile("yield");
if ((cs & (BCM2835_SPI0_CS_RXR|BCM2835_SPI0_CS_RXF))) spi->cs = BCM2835_SPI0_CS_CLEAR_RX | BCM2835_SPI0_CS_TA | DISPLAY_SPI_DRIVE_SETTINGS;
}
}
#ifdef DISPLAY_NEEDS_CHIP_SELECT_SIGNAL
END_SPI_COMMUNICATION();
#endif
}
#endif
SharedMemory *spiTaskMemory = 0;
volatile uint64_t spiThreadIdleUsecs = 0;
volatile uint64_t spiThreadSleepStartTime = 0;
volatile int spiThreadSleeping = 0;
double spiUsecsPerByte;
SPITask *GetTask() // Returns the first task in the queue, called in worker thread
{
uint32_t head = spiTaskMemory->queueHead;
uint32_t tail = spiTaskMemory->queueTail;
if (head == tail) return 0;
SPITask *task = (SPITask*)(spiTaskMemory->buffer + head);
if (task->cmd == 0) // Wrapped around?
{
spiTaskMemory->queueHead = 0;
__sync_synchronize();
if (tail == 0) return 0;
task = (SPITask*)spiTaskMemory->buffer;
}
return task;
}
void DoneTask(SPITask *task) // Frees the first SPI task from the queue, called in worker thread
{
__atomic_fetch_sub(&spiTaskMemory->spiBytesQueued, task->PayloadSize()+1, __ATOMIC_RELAXED);
spiTaskMemory->queueHead = (uint32_t)((uint8_t*)task - spiTaskMemory->buffer) + sizeof(SPITask) + task->size;
__sync_synchronize();
}
extern volatile bool programRunning;
void ExecuteSPITasks()
{
#ifndef USE_DMA_TRANSFERS
BEGIN_SPI_COMMUNICATION();
#endif
{
while(programRunning && spiTaskMemory->queueTail != spiTaskMemory->queueHead)
{
SPITask *task = GetTask();
if (task)
{
RunSPITask(task);
DoneTask(task);
}
}
}
#ifndef USE_DMA_TRANSFERS
END_SPI_COMMUNICATION();
#endif
}
#if !defined(KERNEL_MODULE) && defined(USE_SPI_THREAD)
pthread_t spiThread;
// A worker thread that keeps the SPI bus filled at all times
void *spi_thread(void *unused)
{
#ifdef RUN_WITH_REALTIME_THREAD_PRIORITY
SetRealtimeThreadPriority();
#endif
while(programRunning)
{
if (spiTaskMemory->queueTail != spiTaskMemory->queueHead)
{
ExecuteSPITasks();
}
else
{
#ifdef STATISTICS
uint64_t t0 = tick();
spiThreadSleepStartTime = t0;
__atomic_store_n(&spiThreadSleeping, 1, __ATOMIC_RELAXED);
#endif
if (programRunning) syscall(SYS_futex, &spiTaskMemory->queueTail, FUTEX_WAIT, spiTaskMemory->queueHead, 0, 0, 0); // Start sleeping until we get new tasks
#ifdef STATISTICS
__atomic_store_n(&spiThreadSleeping, 0, __ATOMIC_RELAXED);
uint64_t t1 = tick();
__sync_fetch_and_add(&spiThreadIdleUsecs, t1-t0);
#endif
}
}
pthread_exit(0);
}
#endif
int InitSPI()
{
#ifdef KERNEL_MODULE
#define BCM2835_PERI_BASE 0x3F000000
#define BCM2835_GPIO_BASE 0x200000
#define BCM2835_SPI0_BASE 0x204000
printk("ioremapping %p\n", (void*)(BCM2835_PERI_BASE+BCM2835_GPIO_BASE));
void *bcm2835 = ioremap(BCM2835_PERI_BASE+BCM2835_GPIO_BASE, 32768);
printk("Got bcm address %p\n", bcm2835);
if (!bcm2835) FATAL_ERROR("Failed to map BCM2835 address!");
spi = (volatile SPIRegisterFile*)((uintptr_t)bcm2835 + BCM2835_SPI0_BASE - BCM2835_GPIO_BASE);
gpio = (volatile GPIORegisterFile*)((uintptr_t)bcm2835);
#else // Userland version
// Memory map GPIO and SPI peripherals for direct access
mem_fd = open("/dev/mem", O_RDWR|O_SYNC);
if (mem_fd < 0) FATAL_ERROR("can't open /dev/mem (run as sudo)");
printf("bcm_host_get_peripheral_address: %p, bcm_host_get_peripheral_size: %u, bcm_host_get_sdram_address: %p\n", bcm_host_get_peripheral_address(), bcm_host_get_peripheral_size(), bcm_host_get_sdram_address());
bcm2835 = mmap(NULL, bcm_host_get_peripheral_size(), (PROT_READ | PROT_WRITE), MAP_SHARED, mem_fd, bcm_host_get_peripheral_address());
if (bcm2835 == MAP_FAILED) FATAL_ERROR("mapping /dev/mem failed");
spi = (volatile SPIRegisterFile*)((uintptr_t)bcm2835 + BCM2835_SPI0_BASE);
gpio = (volatile GPIORegisterFile*)((uintptr_t)bcm2835 + BCM2835_GPIO_BASE);
systemTimerRegister = (volatile uint64_t*)((uintptr_t)bcm2835 + BCM2835_TIMER_BASE + 0x04); // Generates an unaligned 64-bit pointer, but seems to be fine.
// TODO: On graceful shutdown, (ctrl-c signal?) close(mem_fd)
#endif
uint32_t currentBcmCoreSpeed = MailboxRet2(0x00030002/*Get Clock Rate*/, 0x4/*CORE*/);
uint32_t maxBcmCoreTurboSpeed = MailboxRet2(0x00030004/*Get Max Clock Rate*/, 0x4/*CORE*/);
// Estimate how many microseconds transferring a single byte over the SPI bus takes?
spiUsecsPerByte = 1000000.0 * 8.0/*bits/byte*/ * SPI_BUS_CLOCK_DIVISOR / maxBcmCoreTurboSpeed;
printf("BCM core speed: current: %uhz, max turbo: %uhz. SPI CDIV: %d, SPI max frequency: %.0fhz\n", currentBcmCoreSpeed, maxBcmCoreTurboSpeed, SPI_BUS_CLOCK_DIVISOR, (double)maxBcmCoreTurboSpeed / SPI_BUS_CLOCK_DIVISOR);
#if !defined(KERNEL_MODULE_CLIENT) || defined(KERNEL_MODULE_CLIENT_DRIVES)
// By default all GPIO pins are in input mode (0x00), initialize them for SPI and GPIO writes
#ifdef GPIO_TFT_DATA_CONTROL
SET_GPIO_MODE(GPIO_TFT_DATA_CONTROL, 0x01); // Data/Control pin to output (0x01)
#endif
// The Pirate Audio hat ST7789 based display has Data/Control on the MISO pin, so only initialize the pin as MISO if the
// Data/Control pin does not use it.
#if !defined(GPIO_TFT_DATA_CONTROL) || GPIO_TFT_DATA_CONTROL != GPIO_SPI0_MISO
SET_GPIO_MODE(GPIO_SPI0_MISO, 0x04);
#endif
SET_GPIO_MODE(GPIO_SPI0_MOSI, 0x04);
SET_GPIO_MODE(GPIO_SPI0_CLK, 0x04);
#ifdef DISPLAY_NEEDS_CHIP_SELECT_SIGNAL
// The Adafruit 1.65" 240x240 ST7789 based display is unique compared to others that it does want to see the Chip Select line go
// low and high to start a new command. For that display we let hardware SPI toggle the CS line, and actually run TA<-0 and TA<-1
// transitions to let the CS line live. For most other displays, we just set CS line always enabled for the display throughout
// fbcp-ili9341 lifetime, which is a tiny bit faster.
SET_GPIO_MODE(GPIO_SPI0_CE0, 0x04);
#ifdef DISPLAY_USES_CS1
SET_GPIO_MODE(GPIO_SPI0_CE1, 0x04);
#endif
#else
// Set the SPI 0 pin explicitly to output, and enable chip select on the line by setting it to low.
// fbcp-ili9341 assumes exclusive access to the SPI0 bus, and exclusive presence of only one device on the bus,
// which is (permanently) activated here.
SET_GPIO_MODE(GPIO_SPI0_CE0, 0x01);
CLEAR_GPIO(GPIO_SPI0_CE0);
#ifdef DISPLAY_USES_CS1
SET_GPIO_MODE(GPIO_SPI0_CE1, 0x01);
#endif
#endif
spi->cs = BCM2835_SPI0_CS_CLEAR | DISPLAY_SPI_DRIVE_SETTINGS; // Initialize the Control and Status register to defaults: CS=0 (Chip Select), CPHA=0 (Clock Phase), CPOL=0 (Clock Polarity), CSPOL=0 (Chip Select Polarity), TA=0 (Transfer not active), and reset TX and RX queues.
spi->clk = SPI_BUS_CLOCK_DIVISOR; // Clock Divider determines SPI bus speed, resulting speed=256MHz/clk
#endif
// Initialize SPI thread task buffer memory
#ifdef KERNEL_MODULE_CLIENT
int driverfd = open("/proc/bcm2835_spi_display_bus", O_RDWR|O_SYNC);
if (driverfd < 0) FATAL_ERROR("Could not open SPI ring buffer - kernel driver module not running?");
spiTaskMemory = (SharedMemory*)mmap(NULL, SHARED_MEMORY_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED/* | MAP_NORESERVE | MAP_POPULATE | MAP_LOCKED*/, driverfd, 0);
close(driverfd);
if (spiTaskMemory == MAP_FAILED) FATAL_ERROR("Could not mmap SPI ring buffer!");
printf("Got shared memory block %p, ring buffer head %p, ring buffer tail %p, shared memory block phys address: %p\n", (const char *)spiTaskMemory, spiTaskMemory->queueHead, spiTaskMemory->queueTail, spiTaskMemory->sharedMemoryBaseInPhysMemory);
#ifdef USE_DMA_TRANSFERS
printf("DMA TX channel: %d, DMA RX channel: %d\n", spiTaskMemory->dmaTxChannel, spiTaskMemory->dmaRxChannel);
#endif
#else
#ifdef KERNEL_MODULE
spiTaskMemory = (SharedMemory*)kmalloc(SHARED_MEMORY_SIZE, GFP_KERNEL | GFP_DMA);
// TODO: Ideally we would be able to directly perform the DMA from the SPI ring buffer in 'spiTaskMemory'. However
// that pointer is shared to userland, and it is proving troublesome to make it both userland-writable as well as cache-bypassing DMA coherent.
// Therefore these two memory areas are separate for now, and we memcpy() from SPI ring buffer to the following intermediate 'dmaSourceMemory'
// memory area to perform the DMA transfer. Is there a way to avoid this intermediate buffer? That would improve performance a bit.
dmaSourceMemory = (SharedMemory*)dma_alloc_writecombine(0, SHARED_MEMORY_SIZE, &spiTaskMemoryPhysical, GFP_KERNEL);
LOG("Allocated DMA memory: mem: %p, phys: %p", spiTaskMemory, (void*)spiTaskMemoryPhysical);
memset((void*)spiTaskMemory, 0, SHARED_MEMORY_SIZE);
#else
spiTaskMemory = (SharedMemory*)Malloc(SHARED_MEMORY_SIZE, "spi.cpp shared task memory");
#endif
spiTaskMemory->queueHead = spiTaskMemory->queueTail = spiTaskMemory->spiBytesQueued = 0;
#endif
#ifdef USE_DMA_TRANSFERS
InitDMA();
#endif
// Enable fast 8 clocks per byte transfer mode, instead of slower 9 clocks per byte.
UNLOCK_FAST_8_CLOCKS_SPI();
#if !defined(KERNEL_MODULE) && (!defined(KERNEL_MODULE_CLIENT) || defined(KERNEL_MODULE_CLIENT_DRIVES))
printf("Initializing display\n");
InitSPIDisplay();
#ifdef USE_SPI_THREAD
// Create a dedicated thread to feed the SPI bus. While this is fast, it consumes a lot of CPU. It would be best to replace
// this thread with a kernel module that processes the created SPI task queue using interrupts. (while juggling the GPIO D/C line as well)
printf("Creating SPI task thread\n");
int rc = pthread_create(&spiThread, NULL, spi_thread, NULL); // After creating the thread, it is assumed to have ownership of the SPI bus, so no SPI chat on the main thread after this.
if (rc != 0) FATAL_ERROR("Failed to create SPI thread!");
#else
// We will be running SPI tasks continuously from the main thread, so keep SPI Transfer Active throughout the lifetime of the driver.
BEGIN_SPI_COMMUNICATION();
#endif
#endif
LOG("InitSPI done");
return 0;
}
void DeinitSPI()
{
#ifdef USE_SPI_THREAD
pthread_join(spiThread, NULL);
spiThread = (pthread_t)0;
#endif
DeinitSPIDisplay();
#ifdef USE_DMA_TRANSFERS
DeinitDMA();
#endif
spi->cs = BCM2835_SPI0_CS_CLEAR | DISPLAY_SPI_DRIVE_SETTINGS;
#ifndef KERNEL_MODULE_CLIENT
#ifdef GPIO_TFT_DATA_CONTROL
SET_GPIO_MODE(GPIO_TFT_DATA_CONTROL, 0);
#endif
SET_GPIO_MODE(GPIO_SPI0_CE1, 0);
SET_GPIO_MODE(GPIO_SPI0_CE0, 0);
SET_GPIO_MODE(GPIO_SPI0_MISO, 0);
SET_GPIO_MODE(GPIO_SPI0_MOSI, 0);
SET_GPIO_MODE(GPIO_SPI0_CLK, 0);
#endif
if (bcm2835)
{
munmap((void*)bcm2835, bcm_host_get_peripheral_size());
bcm2835 = 0;
}
if (mem_fd >= 0)
{
close(mem_fd);
mem_fd = -1;
}
#ifndef KERNEL_MODULE_CLIENT
#ifdef KERNEL_MODULE
kfree(spiTaskMemory);
dma_free_writecombine(0, SHARED_MEMORY_SIZE, dmaSourceMemory, spiTaskMemoryPhysical);
spiTaskMemoryPhysical = 0;
#else
free(spiTaskMemory);
#endif
#endif
spiTaskMemory = 0;
}