// Overwrite the 2-bit brightness code of one pixel in pixel_buffer.
//   r: row, wrapped into 0-127 (row 0+ corresponds to zenith going down to horizon in phi)
//   c: column, wrapped into 0-511 (column 0+ corresponds to most negative theta and goes to most positive theta, theta=0 at c=255.5)
//   d: brightness code, wrapped into 0-3
// Each 32-bit word of pixel_buffer packs 16 consecutive columns of one row at
// 2 bits per pixel (lower columns in the LSBs); a row occupies 32 words.
void setPixel(uint32_t r, uint32_t c, uint32_t d){
  //ensure valid limits
  c%=512;
  r%=128;
  d%=4;
  uint32_t s=(c%16)*2;//bit offset of this pixel inside its word: 0,2,...,30
  //build the mask from an unsigned 32-bit value: 0x03<<30 does not fit in a signed int
  uint32_t bc=~(((uint32_t)0x03)<<s);
  uint32_t i=r*32+(c/16);//word index: 32 words per row, 16 pixels per word
  pixel_buffer[i]=(pixel_buffer[i]&bc)|(d<<s);//clear the old 2 bits, then insert the new code
}

// OR a 2-bit brightness code into one pixel of pixel_buffer without clearing
// the bits that are already set there.
//   r: row, wrapped into 0-127 (row 0+ corresponds to zenith going down to horizon in phi)
//   c: column, wrapped into 0-511 (column 0+ corresponds to most negative theta, theta=0 at c=255.5)
//   d: brightness code, wrapped into 0-3
void setPixelOr(uint32_t r, uint32_t c, uint32_t d){
  //wrap all arguments into their valid ranges
  const uint32_t col=c%512;
  const uint32_t row=r%128;
  const uint32_t level=d%4;
  const uint32_t word=row*32+(col/16);//32 words per row, 16 pixels per word
  const uint32_t shift=(col%16)*2;//2 bits per pixel, lower columns in the LSBs
  pixel_buffer[word]|=level<<shift;
}

// Route the ~reset (HW pin 32) and ~PCLK (HW pin 8) lines for a drawing mode.
//   0 = reset and pulse lines active, lasers active for pulse period only and with reset clock (fast pixels)
//   1 = pulse line fixed, lasers active on latch and reset after reset time duration (slow pixels)
//   2 = reset line fixed, lasers active on pulse period only (fast pixels without reset)
//   3 = reset and pulse lines fixed, lasers active on latch and stay on until zeroed (lines)
// Any other value leaves both lines in the initial state set below (LDs reset, outputs disabled).
void setDrawingMode(byte pixel){
  //start from a known state: ~reset driven low, ~PCLK driven high
  pinMode(32, OUTPUT);
  digitalWrite(32, LOW);//reset LDs
  pinMode(8, OUTPUT);
  digitalWrite(8, HIGH);//disable outputs
  delayMicroseconds(1);

  switch(pixel){
    case 0:
      IOMUXC_SW_MUX_CTL_PAD_GPIO_B0_12=4; // ~reset - HW pin 32 to FlexIO2.12
      IOMUXC_SW_MUX_CTL_PAD_GPIO_B1_00=4; // ~PCLK - HW pin 8 to FlexIO2.16
      break;
    case 1:
      IOMUXC_SW_MUX_CTL_PAD_GPIO_B0_12=4; // ~reset - HW pin 32 to FlexIO2.12
      digitalWrite(8, LOW);//enable outputs
      break;
    case 2:
      IOMUXC_SW_MUX_CTL_PAD_GPIO_B1_00=4; // ~PCLK - HW pin 8 to FlexIO2.16
      digitalWrite(32, HIGH);//operate LDs
      break;
    case 3:
      IOMUXC_SW_MUX_CTL_PAD_GPIO_B0_12=5; // ~reset - HW pin 32 to GPIO
      IOMUXC_SW_MUX_CTL_PAD_GPIO_B1_00=5; // ~PCLK - HW pin 8 to GPIO
      digitalWrite(32, HIGH);//operate LDs
      digitalWrite(8, LOW);//enable outputs
      break;
    default:
      break;//unknown mode: keep LDs reset and outputs disabled
  }
}

// Select the brightness mode and the matching drawing mode.
//   brt==2: drawing mode 2, bright_mode=2 (no reset pulse, briefest pulse is first in order, pulses are summed)
//   brt==1: drawing mode 2, bright_mode=1
//   other : drawing mode 0, bright_mode=0 (use reset pulse, briefest pulse is second in order, pulses not summed)
// While the display is active the laser buffer is regenerated, because the
// order of the brightness steps in it depends on bright_mode.
void setBrightMode(byte brt){
  byte mode;
  byte drawing;
  switch(brt){
    case 2:
      mode=2;
      drawing=2;
      break;
    case 1:
      mode=1;
      drawing=2;
      break;
    default:
      mode=0;
      drawing=0;
      break;
  }
  setDrawingMode(drawing);
  bright_mode=mode;
  if(display_active){
    generateLaserBuffer();//redo bit order
  }
}

// Fill laser_buffer with a test pattern (the buffer is always cleared first).
//   pt==0      : leave the buffer cleared.
//   pt 1..255  : set the bit of laser pt in the first pixel slot, in all three
//                brightness-step words (+0, +8, +16).
//   pt>=256    : walk through lasers 0..255, each in its own pixel slot.
// Layout, as used by generateLaserBuffer(): a pixel slot is 24 words = 3
// brightness steps of 8 words; within a step, word k holds lasers 32k..32k+31
// with the LSB being the lowest laser number.
void generateTestPattern(uint32_t pt){
  clearLaserBuffer();
  if(pt==0) { return; }
  if(pt<256){
    //turn on single LD
    int byo=pt/32;//32bit Dword offset from 0 element
    int bio=pt%32;//bit offset from LSB
    uint32_t dwd=((uint32_t)1)<<bio;//unsigned shift: 1<<31 overflows a signed int
    laser_buffer[byo]=dwd;//step 1
    laser_buffer[byo+8]=dwd;//step 2
    laser_buffer[byo+16]=dwd;//step 3
  }else{
    int byo=0;//running word offset; advances by two pixel slots per laser
    for(uint32_t ld=0; ld<256; ld++){
      int bio=ld%32;//bit offset from LSB
      uint32_t dwd=((uint32_t)1)<<bio;//unsigned shift: 1<<31 overflows a signed int
      if(bio!=0){
        laser_buffer[byo]=dwd;//step 1 word
      }else{
        laser_buffer[byo+16]=dwd;//step 3 word for the first laser of each 32-laser group
      }
      if(bio==31){
        laser_buffer[byo+8]=dwd;//step 2 word as well for the last laser of each group
        byo+=49;//two pixel slots plus one word: continue in the next 32-laser word
      }else{
        byo+=48;//skip every other pixel slot
      }
    }
  }
}

// Rebuild laser_buffer from pixel_buffer.
// Every non-zero pixel (row r, displayed column c) is mapped to a laser index l
// and a FlexIO output index m, and the laser's bit is set in one or more of the
// three brightness-step words of output slot m, depending on bright_mode:
//   bright_mode 0: d=2 -> step 1, d=1 -> step 2, d=3 -> step 3
//   bright_mode 1: d=1 -> step 1, d=2 -> step 2, d=3 -> step 3
//   otherwise    : d=1 -> step 1, d=2 -> steps 1+2, d=3 -> steps 1+2+3 (pulses summed)
// The pixel data for displayed column c is read from column (c+north_pixel_shift)%512.
void generateLaserBuffer(){
  //this scans through pixel buffer
  clearLaserBuffer();
  byte bmloc=bright_mode;//make local copy to avoid changing mid-function
  const uint32_t numlasers=256;
  for(uint32_t r=0; r<128; r++){
    for(uint32_t c=0; c<512; c++){
      uint32_t p=(c+north_pixel_shift)%512;
      uint32_t i=(r<<5)+(p>>4);//requested pixel data
      uint32_t d=(pixel_buffer[i]>>((p%16)*2))&0x03;//LSBs are lower column indices
      if(d==0){ continue; }
      //solving for laser and mstep given an index (i0=row(phi),j0=column(theta)) (0-based) in the above matrix
      //if (i0+j0) is even
      //  l=pmodulo((numlasers/4)-(i0-j0)/2,numlasers);
      //  m=pmodulo((numlasers+1)+2*(j0)+2*numlasers,4*numlasers)-2*numlasers;
      //else (i0+j0) is odd
      //  l=pmodulo((i0+j0+1)/2-(numlasers/4),numlasers);
      //  m=pmodulo((-numlasers+1)+2*(j0)+2*numlasers,4*numlasers)-2*numlasers;
      //NOTE: the arithmetic below deliberately relies on unsigned 32-bit wrap-around;
      //the later reductions modulo 256 and 1024 (both divide 2^32) recover the positive modulo
      uint32_t l,m;
      if((r+c)%2==0){
        l=(numlasers/4)-(r-c)/2;//laser index, 0 to numlasers-1; laser 0 faces mirror at theta=0
        m=(numlasers+1)+2*(c);//mirror step, 0 to (4*numlasers)-1, odd only; mirror faces lasers at mod4, theta=0 is at m=0, pixel flashes are at all odd m (which is total 2*numlasers)
      }else{
        l=(r+c+1)/2-(numlasers/4);
        m=(1-numlasers)+2*(c);
      }
      l%=numlasers;//return to laser basis
      m=((m%(numlasers*4))-1);//flexio output index, 0 to (4*numlasers)-2 =1022, even only (2x mirror theta index to allow for 2-output pixel then by 8*1.5 for 3-outputs)

      uint32_t i2=(m*12)+(l>>5);//step 1 word of this output slot (24 words per slot, 8 words per step)
      uint32_t s=l%32;
      uint32_t bs=(((uint32_t)1)<<s);//LSB corresponds to lower laser number; unsigned shift because 1<<31 overflows a signed int

      if(bmloc==0){
        //step 1 = medium brightness
        if(d==1){
          i2+=8;//step 2 = low brightness
        }else if(d==3){
          i2+=16;//step 3 = high brightness
        }
        laser_buffer[i2]|=bs;
      }else if(bmloc==1){
        //step 1 = low brightness, 2 = medium, 3 = high
        if(d==2){
          i2+=8;
        }else if(d==3){
          i2+=16;
        }
        laser_buffer[i2]|=bs;
      }else{
        //step 1 = low brightness, 1+2 = medium, 1+2+3 = high
        laser_buffer[i2]|=bs;
        if(d>1){
          laser_buffer[i2+8]|=bs;
        }
        if(d>2){
          laser_buffer[i2+16]|=bs;
        }
      }
    }
  }
}

// Zero the whole pixel buffer: 4096 words of 4 bytes each.
void clearPixelBuffer(){
  const size_t words=4096;//128 rows of 32 words, matching the indexing in setPixel()
  memset(pixel_buffer, 0, words*4);
}

// Zero all laser_buffer_bytes bytes of laser_buffer.
void clearLaserBuffer(){
  const size_t nbytes=laser_buffer_bytes;
  memset(laser_buffer, 0, nbytes);
}

// Zero the eight FlexIO2 shift buffer registers (SHIFTBUF0..SHIFTBUF7), which
// are addressed as 8 consecutive 32-bit words starting at FLEXIO2_SHIFTBUF0.
// The registers are written with explicit volatile 32-bit stores instead of
// memset(): memset() drops the volatile qualifier and leaves the access width
// up to the library, neither of which is appropriate for peripheral registers.
void clearShiftbufs(){
  volatile uint32_t *shiftbuf=&FLEXIO2_SHIFTBUF0;
  for(int n=0; n<8; n++){
    shiftbuf[n]=0;
  }
}

// Load the bit of laser ld into its FlexIO2 bit-swapped shift buffer register.
//   ld: laser number, clamped into 0-255; ld/32 selects the register (0 to 7),
//       ld%32 selects the bit.
// The register is overwritten as a whole, so the other 31 bits of that
// register are written as zero.
void turnOnLDFlex(int ld){
  ld=constrain(ld,0,255);
  int dpin=ld/32;//0 to 7
  volatile uint32_t* douts[]={&FLEXIO2_SHIFTBUFBIS0,&FLEXIO2_SHIFTBUFBIS1,&FLEXIO2_SHIFTBUFBIS2,&FLEXIO2_SHIFTBUFBIS3,&FLEXIO2_SHIFTBUFBIS4,&FLEXIO2_SHIFTBUFBIS5,&FLEXIO2_SHIFTBUFBIS6,&FLEXIO2_SHIFTBUFBIS7};
  //note the first bit to be shifted out will be LSB, then writing to bit-swapped register means MSB here corresponds to high number LD
  //unsigned shift: 1<<31 overflows a signed int
  uint32_t sr=((uint32_t)1)<<(ld%32);
  *douts[dpin]=sr;
}

// Write zero to the FlexIO2 bit-swapped shift buffer register that holds laser ld.
//   ld: laser number, clamped into 0-255; ld/32 selects the register (0 to 7).
// The whole 32-bit register is zeroed, not only the bit of the given laser.
void turnOffLDFlex(int ld){
  volatile uint32_t* const bank[]={
    &FLEXIO2_SHIFTBUFBIS0,&FLEXIO2_SHIFTBUFBIS1,&FLEXIO2_SHIFTBUFBIS2,&FLEXIO2_SHIFTBUFBIS3,
    &FLEXIO2_SHIFTBUFBIS4,&FLEXIO2_SHIFTBUFBIS5,&FLEXIO2_SHIFTBUFBIS6,&FLEXIO2_SHIFTBUFBIS7
  };
  const int clamped=constrain(ld,0,255);
  const int group=clamped/32;//0 to 7
  *bank[group]=0;
}

// Program the eight FlexIO2 timers (TIMCFGn/TIMCTLn/TIMCMPn for n=0..7) that
// sequence one pixel: shifting out the laser data, latching it, and generating
// the ~reset and ~pulse signals.
// Reads the globals pxt0lsb, pxt1dly, pxt2dly, pxt5dly and pxt7dly for the
// compare values. Side effect: pxt0lsb is raised to at least 65 and pxt5dly to
// at least 129 before being used.
// Internal trigger numbering used below: TRGSEL(4*N+3) selects the output of timer N.
void configureFlexIO2Timers(){
    //Set up timers
  // 7.5 MHz = 4.27 us to shift out 32 bits
  // Incoming FlexIO rising or falling edge trigger -> Timer0 to output 3 rising edges to trigger data output ->
  //                                        .1                                .2                                .3
  //    Timer1 to wait until 2 steps sent out + latched -> Timer3 to output reset signal                       .
  //    Timer2 to wait until 1 step sent out + latched -> Timer7 to output pulse signal                     ............................................................
  //    Timer4 to send out data             ................................  ................................  ................................
  //    Timer5 to wait until data sent out -> Timer6 to output latch signal .                                 .                                 .
  //         Time from FlexIO trigger      0                                4.3                            9.27                                              19 20
  //         Visible light                                                                                  112                                 333333333333333333333333
  // At 30 Hz * 512 pixels there are max 65 us per pixel, and at 19 Hz max 102 us per pixel
  //Use TIMER0 to trigger multiple shifts per one edge of incoming pixel trigger, use in 8-bit counters baud mode, with lower byte setting 32-bit shift duration, and upper byte setting shift multiplier
  //don't output timer0 on pin; "TIMCFGn should be configured before setting TIMOD" from manual
  FLEXIO2_TIMCFG0 = FLEXIO_TIMCFG_TIMOUT(0) | FLEXIO_TIMCFG_TIMDEC(0) | FLEXIO_TIMCFG_TIMDIS(2) | FLEXIO_TIMCFG_TIMENA(7); //run timer once, from FlexIO clock, enabled by trigger rising or falling edge, start with high state
  FLEXIO2_TIMCTL0 = FLEXIO_TIMCTL_TRGSEL(0) | FLEXIO_TIMCTL_TIMOD(1); //operate in dual 8-bit baud mode, external trigger 0 (from XBAR1), and don't output to pin
  if(pxt0lsb<65){pxt0lsb=65;} //enforce a minimum for the lower (duration) byte of TIMCMP0
  //FLEXIO2_TIMCMP0 = 0x0547; //delay by 4.8 us (short pulse duration of which 4.4 us are send+latch) = 144 cycles of 30 MHz (144/2-1=0x47), 3 transfers (3*2-1=0x05)
  FLEXIO2_TIMCMP0 = 0x0500 | pxt0lsb; //upper byte 0x05 = 3 transfers (3*2-1), lower byte from pxt0lsb
  
  //Use TIMER1 to wait for data shift + latch + short pulse before reset
  FLEXIO2_TIMCFG1 = FLEXIO_TIMCFG_TIMOUT(0) | FLEXIO_TIMCFG_TIMDEC(0) | FLEXIO_TIMCFG_TIMDIS(2) | FLEXIO_TIMCFG_TIMENA(1); //run timer once, from FlexIO clock; enable with timer0, disable on compare
  FLEXIO2_TIMCTL1 = FLEXIO_TIMCTL_TIMOD(3); //operate in 16-bit counter mode, no pin output
  //FLEXIO2_TIMCMP1 = 283; //delay by 284 cycles of 30 MHz, 284-1=283
  FLEXIO2_TIMCMP1 = pxt1dly;
  
  //Use TIMER2 to wait for data shift + latch + short pulse before enable
  FLEXIO2_TIMCFG2 = FLEXIO_TIMCFG_TIMOUT(0) | FLEXIO_TIMCFG_TIMDEC(0) | FLEXIO_TIMCFG_TIMDIS(2) | FLEXIO_TIMCFG_TIMENA(1); //run timer once, from FlexIO clock; enable with timer1, disable on compare
  FLEXIO2_TIMCTL2 = FLEXIO_TIMCTL_TIMOD(3); //operate in 16-bit counter mode, no pin output
  //FLEXIO2_TIMCMP2 = 211; //delay by 212 cycles of 30 MHz, 212-1=211
  FLEXIO2_TIMCMP2 = pxt2dly;
    
  //Use TIMER3 to output the ~reset clock to HW pin 32 after data has been latched and shown with long pulse after TIMER1
  FLEXIO2_TIMCFG3 = FLEXIO_TIMCFG_TIMOUT(0) | FLEXIO_TIMCFG_TIMDEC(0) | FLEXIO_TIMCFG_TIMDIS(2) | FLEXIO_TIMCFG_TIMENA(6); //run timer once, from FlexIO clock; start from rising edge of trigger
  FLEXIO2_TIMCTL3 = FLEXIO_TIMCTL_TRGSEL(4*1+3) | FLEXIO_TIMCTL_TRGPOL | FLEXIO_TIMCTL_TRGSRC | FLEXIO_TIMCTL_PINCFG(3) | FLEXIO_TIMCTL_PINSEL(12) | FLEXIO_TIMCTL_PINPOL | FLEXIO_TIMCTL_TIMOD(3); //operate in 16-bit counter mode once timer1 finished, inverted output (PINPOL) goes low for pulse duration
  FLEXIO2_TIMCMP3 = 1; //cycles of 30 MHz, 0x01 to match above baud rate

  //Use TIMER4 to set the sampling rate and number of bits and output BCLK, use in 8-bit counters baud mode, with lower byte setting the bit clock (7.5 MHz, see TIMCMP4 below), and upper byte setting 32 bits of acquisition before loading buffer
  //output timer4 on HW pin 6; "TIMCFGn should be configured before setting TIMOD" from manual
  FLEXIO2_TIMCFG4 = FLEXIO_TIMCFG_TIMOUT(1) | FLEXIO_TIMCFG_TIMDEC(0) | FLEXIO_TIMCFG_TIMDIS(2) | FLEXIO_TIMCFG_TIMENA(6); //run timer once, from FlexIO clock, enabled by trigger rising edge, start with low state
  FLEXIO2_TIMCTL4 = FLEXIO_TIMCTL_TRGSEL(4*0+3) | FLEXIO_TIMCTL_TRGSRC | FLEXIO_TIMCTL_PINCFG(3) | FLEXIO_TIMCTL_PINSEL(10) | FLEXIO_TIMCTL_TIMOD(1); //operate in dual 8-bit baud mode, internal trigger from timer0, and output to pin f10
  FLEXIO2_TIMCMP4 = 0x3F01; //baud divisor of 4 (30/4=7.5 MHz) (4/2-1=0x01), 32 bits (32*2-1=0x3F) = total sending period 4.27 us or 128 cycles of 30 MHz

  //Use TIMER5 to wait until data has been shifted out before sending latch signal
  FLEXIO2_TIMCFG5 = FLEXIO_TIMCFG_TIMOUT(0) | FLEXIO_TIMCFG_TIMDEC(0) | FLEXIO_TIMCFG_TIMDIS(2) | FLEXIO_TIMCFG_TIMENA(1); //run timer once, from FlexIO clock; enable with timer4, disable on compare
  FLEXIO2_TIMCTL5 = FLEXIO_TIMCTL_PINCFG(0) | FLEXIO_TIMCTL_TIMOD(3); //operate in 16-bit counter mode, no pin output
  //FLEXIO2_TIMCMP5 = 129; //2 counts (from 0x01 baud above) * 2 transitions * 32 bits + 2 counts half-period delay - 1 = 129; 4.3 us or 129 cycles of 30 MHz from start of send to latch rising edge
  if(pxt5dly<129){pxt5dly=129;} //never latch before all 32 bits have been shifted out (see calculation above)
  FLEXIO2_TIMCMP5 = pxt5dly;
  
  //Use TIMER6 to output latch signal on HW pin 7 after TIMER5
  FLEXIO2_TIMCFG6 = FLEXIO_TIMCFG_TIMOUT(0) | FLEXIO_TIMCFG_TIMDEC(0) | FLEXIO_TIMCFG_TIMDIS(2) | FLEXIO_TIMCFG_TIMENA(6); //run timer once, from FlexIO clock; start from rising edge of trigger
  FLEXIO2_TIMCTL6 = FLEXIO_TIMCTL_TRGSEL(4*5+3) | FLEXIO_TIMCTL_TRGPOL | FLEXIO_TIMCTL_TRGSRC | FLEXIO_TIMCTL_PINCFG(3) | FLEXIO_TIMCTL_PINSEL(17) | FLEXIO_TIMCTL_TIMOD(3); //operate in 16-bit counter mode once timer5 finished (TRGSEL 4*5+3)
  FLEXIO2_TIMCMP6 = 1; //1 to match above duty cycle (0x01 baud rate) of 2 30 MHz cycles per pulse

  //Use TIMER7 to output the ~pulse low signal to HW pin 8 after data has been shifted and latched after TIMER2
  FLEXIO2_TIMCFG7 = FLEXIO_TIMCFG_TIMOUT(2) | FLEXIO_TIMCFG_TIMDEC(0) | FLEXIO_TIMCFG_TIMRST(6) | FLEXIO_TIMCFG_TIMDIS(2) | FLEXIO_TIMCFG_TIMENA(6); //run timer once, from FlexIO clock; start + reset from rising edge of trigger
  FLEXIO2_TIMCTL7 = FLEXIO_TIMCTL_TRGSEL(4*2+3) | FLEXIO_TIMCTL_TRGPOL | FLEXIO_TIMCTL_TRGSRC | FLEXIO_TIMCTL_PINCFG(3) | FLEXIO_TIMCTL_PINSEL(16) | FLEXIO_TIMCTL_PINPOL | FLEXIO_TIMCTL_TIMOD(3); //operate in 16-bit counter mode once timer2 finished (TRGSEL 4*2+3), inverted output (PINPOL) goes low for pulse duration
  //FLEXIO2_TIMCMP7 = 625; //626-1=625, pulse length 626 cycles of 30 MHz which includes all 3 brightness steps
  FLEXIO2_TIMCMP7 = pxt7dly;

  //attachInterruptVector(IRQ_FLEXIO2, &dataShiftedIRQ);
  //NVIC_ENABLE_IRQ(IRQ_FLEXIO2);
  //FLEXIO2_SHIFTSIEN=0x80; //enable interrupt on SHIFT7 shifted
  //FLEXIO2_SHIFTSDEN=1; //enable DMA transfer request on shifter 0 status flag (assume all shifters synchronized so this flag can be used to load others as well)
}

// One-time initialisation of FlexIO2 for driving the laser outputs:
//   1. enable the FlexIO2 clock gate and the module itself,
//   2. drive the eight data pins (LD1..LD8), ~reset, BCLK and LCLK low and hand them to FlexIO2;
//      ~PCLK (HW pin 8) is driven high (outputs disabled) and left as a plain output here,
//   3. configure shifters 0..7 to shift data out on their pins, clocked by timer 4,
//   4. configure the timers via configureFlexIO2Timers(),
//   5. prepare the ldWriter DMA channel that copies laser_buffer into the eight
//      shift buffers, 32 bytes per request. The channel is NOT enabled here.
// The long comment at the end describes the intended start-up and refill sequence.
void setupFlexIO2(){
  //configure FlexIO2
  CCM_CCGR3 |= CCM_CCGR3_FLEXIO2(CCM_CCGR_ON); //sends clock to FlexIO 2 and 3
  delayMicroseconds(1);//wait for modules to initialize
  
  FLEXIO2_CTRL |= 1; //enable FlexIO2
  //Default FlexIO clock frequency is 30 MHz

  //set up pins
  //each pin is first driven to its idle level as an output, then its pad mux is switched to FlexIO2 (mux value 4)
  pinMode(34, OUTPUT);
  digitalWrite(34, LOW);
  IOMUXC_SW_MUX_CTL_PAD_GPIO_B1_13=4; // LD8 - HW pin 34 to FlexIO2.29

  pinMode(35, OUTPUT);
  digitalWrite(35, LOW);
  IOMUXC_SW_MUX_CTL_PAD_GPIO_B1_12=4; // LD7 - HW pin 35 to FlexIO2.28

  pinMode(36, OUTPUT);
  digitalWrite(36, LOW);
  IOMUXC_SW_MUX_CTL_PAD_GPIO_B1_02=4; // LD6 - HW pin 36 to FlexIO2.18

  pinMode(37, OUTPUT);
  digitalWrite(37, LOW);
  IOMUXC_SW_MUX_CTL_PAD_GPIO_B1_03=4; // LD5 - HW pin 37 to FlexIO2.19

  pinMode(9, OUTPUT);
  digitalWrite(9, LOW);
  IOMUXC_SW_MUX_CTL_PAD_GPIO_B0_11=4; // LD4 - HW pin 9 to FlexIO2.11

  pinMode(10, OUTPUT);
  digitalWrite(10, LOW);
  IOMUXC_SW_MUX_CTL_PAD_GPIO_B0_00=4; // LD3 - HW pin 10 to FlexIO2.0

  pinMode(11, OUTPUT);
  digitalWrite(11, LOW);
  IOMUXC_SW_MUX_CTL_PAD_GPIO_B0_02=4; // LD2 - HW pin 11 to FlexIO2.2

  pinMode(12, OUTPUT);
  digitalWrite(12, LOW);
  IOMUXC_SW_MUX_CTL_PAD_GPIO_B0_01=4; // LD1 - HW pin 12 to FlexIO2.1

  pinMode(32, OUTPUT);
  digitalWrite(32, LOW);
  IOMUXC_SW_MUX_CTL_PAD_GPIO_B0_12=4; // ~reset - HW pin 32 to FlexIO2.12
  
  pinMode(6, OUTPUT);
  digitalWrite(6, LOW);
  IOMUXC_SW_MUX_CTL_PAD_GPIO_B0_10=4; // BCLK - HW pin 6 to FlexIO2.10

  pinMode(7, OUTPUT);
  digitalWrite(7, LOW);
  IOMUXC_SW_MUX_CTL_PAD_GPIO_B1_01=4; // LCLK - HW pin 7 to FlexIO2.17

  pinMode(8, OUTPUT);
  digitalWrite(8, HIGH);//disable outputs
  //IOMUXC_SW_MUX_CTL_PAD_GPIO_B1_00=4; // ~PCLK - HW pin 8 to FlexIO2.16
  
  //Set up shifters
  //Shift data out of buffers, SHIFT7=LD8 to SHIFT0=LD1
  //all eight shifters use the same settings except for the output pin (PINSEL), which matches the pad mux assignments above
  FLEXIO2_SHIFTCTL7 = FLEXIO_SHIFTCTL_TIMSEL(4) | FLEXIO_SHIFTCTL_TIMPOL | FLEXIO_SHIFTCTL_PINCFG(3) | FLEXIO_SHIFTCTL_PINSEL(29) | FLEXIO_SHIFTCTL_SMOD(2); //shift on falling edge of TIMER4 (TIMSEL(4) with TIMPOL), output on pin f29
  FLEXIO2_SHIFTCTL6 = FLEXIO_SHIFTCTL_TIMSEL(4) | FLEXIO_SHIFTCTL_TIMPOL | FLEXIO_SHIFTCTL_PINCFG(3) | FLEXIO_SHIFTCTL_PINSEL(28) | FLEXIO_SHIFTCTL_SMOD(2); 
  FLEXIO2_SHIFTCTL5 = FLEXIO_SHIFTCTL_TIMSEL(4) | FLEXIO_SHIFTCTL_TIMPOL | FLEXIO_SHIFTCTL_PINCFG(3) | FLEXIO_SHIFTCTL_PINSEL(18) | FLEXIO_SHIFTCTL_SMOD(2); 
  FLEXIO2_SHIFTCTL4 = FLEXIO_SHIFTCTL_TIMSEL(4) | FLEXIO_SHIFTCTL_TIMPOL | FLEXIO_SHIFTCTL_PINCFG(3) | FLEXIO_SHIFTCTL_PINSEL(19) | FLEXIO_SHIFTCTL_SMOD(2); 
  FLEXIO2_SHIFTCTL3 = FLEXIO_SHIFTCTL_TIMSEL(4) | FLEXIO_SHIFTCTL_TIMPOL | FLEXIO_SHIFTCTL_PINCFG(3) | FLEXIO_SHIFTCTL_PINSEL(11) | FLEXIO_SHIFTCTL_SMOD(2); 
  FLEXIO2_SHIFTCTL2 = FLEXIO_SHIFTCTL_TIMSEL(4) | FLEXIO_SHIFTCTL_TIMPOL | FLEXIO_SHIFTCTL_PINCFG(3) | FLEXIO_SHIFTCTL_PINSEL(0) | FLEXIO_SHIFTCTL_SMOD(2); 
  FLEXIO2_SHIFTCTL1 = FLEXIO_SHIFTCTL_TIMSEL(4) | FLEXIO_SHIFTCTL_TIMPOL | FLEXIO_SHIFTCTL_PINCFG(3) | FLEXIO_SHIFTCTL_PINSEL(2) | FLEXIO_SHIFTCTL_SMOD(2); 
  FLEXIO2_SHIFTCTL0 = FLEXIO_SHIFTCTL_TIMSEL(4) | FLEXIO_SHIFTCTL_TIMPOL | FLEXIO_SHIFTCTL_PINCFG(3) | FLEXIO_SHIFTCTL_PINSEL(1) | FLEXIO_SHIFTCTL_SMOD(2); 
  
  configureFlexIO2Timers();
  
  //Set up DMA Channel:
  ldWriter.begin();
  ldWriter.sourceBuffer(laser_buffer, laser_buffer_bytes); 
  ldWriter.destinationBuffer(&FLEXIO2_SHIFTBUF0, 8*4);//write all 8 shift buffer registers
  //minor loop mapping is enabled by default in .begin()  C:\Program Files (x86)\Arduino\hardware\teensy\avr\cores\teensy4\DMAChannel.cpp
  ldWriter.TCD->NBYTES_MLOFFYES=(1<<30)|(((0-32)&0xFFFFF)<<10)|(8*4);//transfer 32 bytes per minor loop to fill all shift buffer registers, enable destination offset minor loop map of -32 bytes to return to beginning of shift registers
  ldWriter.TCD->BITER=laser_buffer_bytes/32;//complete major loop after transferring the whole laser_buffer
  ldWriter.TCD->CITER=laser_buffer_bytes/32;// thereby returning source address to beginning, destination address unaffected as it is returned by minor loop mapping
  //ldWriter.TCD->DLASTSGA=0;//at end of major loop the minor loop shift is ignored
  ldWriter.triggerAtHardwareEvent(DMAMUX_SOURCE_FLEXIO2_REQUEST0);//this request is on status flag of SHIFTBUF0 OR SHIFTBUF1, which acts for the remaining ones as well since they are synchronized
  //when ready to start:
  //ldWriter.enable(); //Note DMA channel should be enabled before enabling the hardware trigger, and disabled in reverse order
  //this does not require any further attention after starting; change laser_buffer to change the displayed image

  //The starting sequence should be:
  //  Wait until appropriate mechanical phase angle (at pixel -1 or earlier, giving adequate time until next timer compare of pixel 0)
  //  Initialize TMR3.0 (FlexIO2 trigger output) so that COMP10 is at time of pixel 0 and CMPLD10 is at time of pixel 1
  //  Preload time values for pixels 2 to 17 (to 33) into timer_buffer (this is needed because it is required that BITER=CITER at start of DMA, that is we cannot specify that a minor loop should be skipped)
  //  Initialize tmrWriter DMA channel to point to first entry in timer_buffer, enable interrupt upon completion
  //    tmrWriter.TCD->SADDR=timer_buffer;
  //  Initialize ldWriter DMA channel to point to data for second brightness step of pixel 0, manually write pixel 0 first step contents into SHIFTBUF
  //    ldWriter.TCD->SADDR=laser_buffer+32;
  //  Enable TMR3.0 output to FlexIO2
  //  Enable FlexIO2 triggering
  //Then after a pixel period, TMR3.0 will equal COMP10 for pixel 0
  //  Trigger FlexIO2 to output pixel 0 first step, DMA request #1 to load data from laser_buffer loads pixel 0 second step, then repeat trigger outputs pixel 0 second step and DMA request #2 loads data for pixel 1 first step
  //  Load CMPLD10 into COMP10 to define pixel 1 trigger time, DMA request #1 to load pixel 2 time into CMPLD10
  //Then TMR3.0 equals COMP10 for pixel 1
  //  Trigger FlexIO2 same as above
  //  Load CMPLD10 into COMP10 to define pixel 2 trigger time, DMA request #2 to load pixel 3 time into CMPLD10
  //... so on until completing 16 (opt_edges_every) steps of DMA timer updates
  //TMR3.0 equals COMP10 for pixel 15
  //  Trigger FlexIO2 to output pixel 15 first step, DMA request #30 to load data from laser_buffer loads pixel 15 second step, then repeat trigger outputs pixel 0 second step and DMA request #31 loads data for pixel 16 first step
  //  Load CMPLD10 into COMP10 to define pixel 16 trigger time, DMA request #16 to load pixel 17 time into CMPLD10
  //    Interrupt called due to completion of tmrWriter DMA major loop (CITER=0), in the interrupt calculate the times for pixels 18 to 33 and write into timer_buffer, then continue with DMA
  // OR use a double-buffer scheme in case there is insufficient time between pixels to finish the above work:
  //TMR3.0 equals COMP10 for pixel 15
  //  Trigger FlexIO2 to output pixel 15 first step, DMA request #30 to load data from laser_buffer loads pixel 15 second step, then repeat trigger outputs pixel 0 second step and DMA request #31 loads data for pixel 16 first step
  //  Load CMPLD10 into COMP10 to define pixel 16 trigger time, DMA request #16 to load pixel 17 time into CMPLD10
  //    Interrupt called due to half-completion of tmrWriter DMA major loop (CITER=BITER/2), in the interrupt calculate the times for pixels 18 to 33 (to 49) and write into timer_buffer from half-way point, then continue with DMA
  //Later TMR3.0 equals COMP10 for pixel 31
  //  Load CMPLD10 into COMP10 to define pixel 32 trigger time, DMA request #32 to load pixel 33 time into CMPLD10
  //    Interrupt called due to completion of tmrWriter DMA major loop (CITER=0), in the interrupt calculate the times for pixels 34 to 49 (to 65) and write into timer_buffer from beginning, then continue with DMA
  //TMR3.1 edge capture interrupt carries out the synchronization variable calculation, it is called at a reduced rate relative to optical edges based on TMR3.2 divider
  //  However the implementation of this does not make any difference for the pixel algorithms above, as they just use the latest ld_sync_var as input
}

// Start the display.
//   synctomotor: true  = (optionally) start the motor, wait for the theta observer to
//                        reach state 15, then schedule the first pixel triggers from ld_sync_var;
//                false = start the DMA channels without mirror synchronization (debugging).
// Does nothing if the display is already active. The laser and pixel buffers are
// cleared, so the image has to be drawn after this call.
// If the motor does not reach a stable state within motor_spinup_time, the display is
// torn down again with stopDisplay() and the function returns with the display inactive.
void startDisplay(bool synctomotor){
  if(display_active) return;

  if(!eth_led_enable){ ethernetLEDState(false); }
  
  pinMode(32, OUTPUT);
  digitalWrite(32, LOW);//reset LDs
  TMR3_ENBL=0;//disable timer3.x and its interrupts
  delayMicroseconds(1);
  setBrightMode(bright_mode);//connect ~PCLK and ~reset to either FlexIO or GPIO depending on brightness mode
  Serial.println("Starting capture");
  capture_ts_write=0;
  display_timer=0;
  opt_state=0;//restart theta observer
  clearLaserBuffer();
  clearPixelBuffer();
  clearShiftbufs();//ensure zero output in case FlexIO2 trigger activates during spin-up
  TMR3_COMP10=0xFFFF;//arbitrary value until we synchronize with rotor
  TMR3_CMPLD10=0xFFFF;//keep same value to reduce unsynchronized interrupt rate
  for(int a=0; a<(timer_buffer_bytes/2); a++){
    timer_buffer[a]=0xFFFF;
  }
  TMR3_CNTR0=0;//zero counters to ensure the input and output timers are synchronized
  TMR3_CNTR1=0;
  opt1_el=0; opt2_el=0;
  //reset the min/max timing statistics
  opt1_el_min=1000000; opt1_el_max=0; opt1_dur_min=1000000; opt1_dur_max=0; opt2_el_min=1000000; opt2_el_max=0; opt2_dur_min=1000000; opt2_dur_max=0;
  TMR3_SCTRL1|=TMR_SCTRL_IEFIE;//enable edge capture interrupt
  elapsedMillis spinup=0;
  TMR3_ENBL=15;//enable timer3.x and its interrupts

  if(synctomotor){
    if(motor_auto_start){
      setMotorLevel(true, motor_default_speed);
    }
    //wait for motor to spin up and theta observer to stabilize
    while(opt_state!=15){
      Serial.print("t=");
      Serial.print(spinup);
      Serial.print(", State=");
      Serial.print(opt_state);
      Serial.print(", RPS=");
      Serial.println(get_motor_speed());
      if(spinup>motor_spinup_time){
        Serial.println("Spin-up error");
        //stopDisplay() returns immediately unless the display is marked active, so mark it
        //active first; otherwise the timer, its interrupts and the motor would be left running
        display_active=true;
        stopDisplay();
        return;
      }
      delay(500);
    }
    Serial.print("Spin-up success, RPS=");
    Serial.println(get_motor_speed());
    const uint32_t pixel_s=opt_edges_dec+(9-opt_edges_cirs);//account for coarser discretization of thetas vs pixels
    const uint32_t pixel_sd=1<<pixel_s;//divider (32)
    const float pixel_fsd=1.0f/(float)pixel_sd;//(1/32) how much of a fraction of a theta step one pixel accounts for (16*32=512)
    target_pixel=0;//in pixel frame 0-511
    float d_pixel;
    uint32_t cc;
    uint16_t tref;
    do{
      delay(10);
      //theta observer is ready, now synchronize to buffers
      uint32_t sv=ld_sync_var;//single read so that all fields below come from the same update
      uint16_t ref_theta=(uint16_t)(((sv&0xFFFF)>>((16-opt_edges_cirs)+opt_edges_dec))<<pixel_s);//shifted to pixel frame 0-511 and 16 discrete values {0,32,64...480}
      cc=(uint32_t)(sv&(0xFFFF>>(opt_edges_cirs-opt_edges_dec)));//timer ticks per reduced theta step
      tref=(uint16_t)(sv>>16);//timer reference counter at reference theta
      float ref_pixel=((float)ref_theta)+theta_pixel_shift+theta_pixel_vshift/cc;//pixel value corresponding to reference theta by +-512 for range (-512 to 1023)
      d_pixel=(target_pixel-ref_pixel);//with target_pixel (0 to 511), this is (-1023 to 1023)
      while(d_pixel>=512){
        d_pixel-=512;
      }
      while(d_pixel<0){//wrap around overflow so that target is always in the future
        d_pixel+=512;
      }
    }while(d_pixel<15); //wait until there is adequate time to set everything up before the target pixel is called
    //if there is too much forward time, that is ok as on the next interrupt this will be reduced

    //Timer will be triggered for pixel 0 by COMP1, then CMPLD1 is for pixel 1, next buffer entry is for pixel 2
    TMR3_COMP10=tref+(uint16_t)round(cc*d_pixel*pixel_fsd);//target_pixel = pixel 0
    d_pixel+=1;
    TMR3_CMPLD10=tref+(uint16_t)round(cc*d_pixel*pixel_fsd);//pixel 1
    d_pixel+=1;
    for(int a=0; a<(timer_buffer_bytes/2); a++){
      timer_buffer[a]=tref+(uint16_t)round(cc*d_pixel*pixel_fsd);//pixels 2-33
      d_pixel=d_pixel+1;//no wrap-around here
    }
    target_pixel=34;//next unassigned pixel to put in timer_buffer, at that DMA interrupt time (pix33->CMPLD1,pix32->COMP1) the FlexIO will be outputting pixel 31
    //now enable DMA channels and requests
  }

  tmrWriter.sourceCircular(timer_buffer, timer_buffer_bytes);//this resets address to beginning of buffer and sets BITER and CITER to define number of minor loop counts in major loop, as well as NUMBYTES=2 bytes transferred per minor loop
  tmrWriter.enable(); //Note DMA channel should be enabled before enabling the hardware trigger, and disabled in reverse order
  TMR3_DMA0=TMR_DMA_CMPLD1DE;//enable DMA request upon using CMPLD10

  ldWriter.TCD->SADDR=laser_buffer;//reset to beginning of buffer in case of past use
  ldWriter.TCD->BITER=laser_buffer_bytes/32;//reset to beginning
  ldWriter.TCD->CITER=laser_buffer_bytes/32;
  ldWriter.enable(); //Note DMA channel should be enabled before enabling the hardware trigger, and disabled in reverse order
  ldWriter.triggerManual();//preload pixel 0 short pulse into buffers so next hardware request will cause loading of pixel 0 long pulse
  FLEXIO2_SHIFTSDEN=1; //enable DMA transfer request on shifter 0 status flag (assume all shifters synchronized so this flag can be used to load others as well)

  display_active=true;
}

// Stop the display and return the hardware to its idle state.
// Does nothing unless display_active is set. In order:
//   - disable the TMR3 edge capture interrupt, the timer and FlexIO DMA requests, and timer 3,
//   - zero the TMR3 channel 2 compare, preload and counter registers,
//   - stop the motor if motor_auto_stop is set,
//   - disable the laser outputs, hold the LDs in reset and hand both pins back to GPIO,
//   - restore the ethernet LED if it had been turned off for the display,
//   - after a 1 ms grace period disable both DMA channels and clear display_active,
//   - print capture diagnostics (timestamps and thetas) on Serial and reset the theta observer.
void stopDisplay(){
  if(!display_active) return;
  
  //hardware request sources are switched off first; the DMA channels themselves are disabled further below
  TMR3_SCTRL1&=~TMR_SCTRL_IEFIE;//stop interrupts
  TMR3_DMA0=0;//stop DMA request
  FLEXIO2_SHIFTSDEN=0;
  TMR3_ENBL=0;
  //return back to fast interrupts
  TMR3_COMP12=0;//count up to this then toggle output for testing
  TMR3_CMPLD12=0;//has to be set despite requesting no preload
  TMR3_CNTR2=0;

  if(motor_auto_stop){ setMotorLevel(false, 0); }

  pinMode(8, OUTPUT);
  digitalWrite(8, HIGH);//disable outputs
  pinMode(32, OUTPUT);
  digitalWrite(32, LOW);//reset LDs
  IOMUXC_SW_MUX_CTL_PAD_GPIO_B0_12=5; // ~reset - HW pin 32 to GPIO
  IOMUXC_SW_MUX_CTL_PAD_GPIO_B1_00=5; // ~PCLK - HW pin 8 to GPIO

  if(!eth_led_enable){ ethernetLEDState(true); }
  
  delay(1); //wait a bit for past DMA requests or interrupts to finish

  tmrWriter.disable();
  ldWriter.disable();
  
  display_active=false;
  
  //diagnostic dump over Serial
  Serial.print(capture_total);
  Serial.println(" samples acquired");
  for(uint32_t n=0;n<opt_edges_nbuf;n++){
    Serial.print(capture_timestamps[n]);
    Serial.print(',');
  }
  Serial.println();
  for(uint32_t n=0;n<opt_edges_thetas;n++){
    Serial.print(1000.0f*thetas[n]);//thetas are printed scaled by 1000
    Serial.print(',');
  }
  Serial.println();
  capture_ts_write=0;
  opt_state=0;//reset theta observer
}

// Interrupt handler that refills timer_buffer with the trigger times of the next pixels.
// Two paths:
//   - opt_state!=15 (theta observer not locked): fill the buffer with evenly spaced times
//     relative to the current TMR3 counter, so the lasers can be tested without the motor;
//   - otherwise: derive the reference theta, ticks per theta step and reference counter from
//     ld_sync_var and compute the trigger time of each pixel starting at target_pixel.
// Also maintains the opt2_el / opt2_dur min/max timing statistics, toggles pin 3 for timing
// measurements, and clears the tmrWriter interrupt before returning.
void preload_timer(){ //interrupt gets called upon end (can also enable halfway) of DMA reads from timer_buffer, need to refill the entire buffer for DMA to continue
  //if(tmrWriter.complete()){
    //end of buffer reached, re-fill from beginning
    //note this determination may not be reliable if the interrupt is delayed as this bit is cleared by next DMA request, so it is better to rely on target pixel offset to determine where to write in buffer
    //tmrWriter.clearComplete();//this may cause errors due to different module clocks, no benefit to clearing it
  //}
  //digitalWriteFast(3, HIGH);//for debugging timing
  digitalToggleFast(3);//for testing timing issues
  //opt2_el is read and zeroed here; its value on entry feeds the opt2_el min/max statistics
  uint32_t tel=opt2_el;
  opt2_el=0;
  if(tel>opt2_el_max){
    opt2_el_max=tel;
  }
  if(tel<opt2_el_min){
    opt2_el_min=tel;
  }
  bool debugmode=(opt_state!=15);
  if(debugmode){
    //turn on lasers without motor for debugging
    const uint16_t tdif=1172;//trigger every ms or so
    uint16_t tfwd=TMR3_CNTR0+tdif*2;//first new entry in buffer loads next request of CMPLD1 which is 3 pixels forward of current counter
    for(int a=0; a<(timer_buffer_bytes/2); a++){
      tfwd+=tdif;
      timer_buffer[a]=tfwd;
    }
  }else{
    //If the timer just now triggered pixel 0, then COMP1 is for pixel 1, CMPLD1 is for pixel 2, and next buffer entry should be for pixel 3
    uint32_t sv=ld_sync_var;//single read so that all fields below come from the same update
    //uint16_t ct=TMR3_CNTR0;//most recent counter value, so we do not schedule an update too soon
    //static uint16_t target_pixel=0;//in pixel frame 0-511
    const uint32_t pixel_s=opt_edges_dec+(9-opt_edges_cirs);//account for coarser discretization of thetas vs pixels
    const uint32_t pixel_sd=1<<pixel_s;//divider
    const float pixel_fsd=1.0f/(float)pixel_sd;//fraction of reduced theta angle that is taken up by 1 pixel step
    uint16_t ref_theta=(uint16_t)(((sv&0xFFFF)>>((16-opt_edges_cirs)+opt_edges_dec))<<pixel_s);//shifted to pixel frame 0-511
    uint32_t cc=(uint32_t)(sv&(0xFFFF>>(opt_edges_cirs-opt_edges_dec)));//timer ticks per reduced theta step
    uint16_t tref=(uint16_t)(sv>>16);//timer reference counter at reference theta
    float ref_pixel=((float)ref_theta)+theta_pixel_shift+theta_pixel_vshift/cc;//pixel value corresponding to reference theta
    float d_pixel=(target_pixel-ref_pixel);
    while(d_pixel>=512){
      d_pixel-=512;
    }
    while(d_pixel<0){//wrap around overflow so that target is always in the future
      d_pixel+=512;
    }
    //uint16_t tnext=tref+(uint16_t)round(cc*d_pixel*pixel_fsd);
    //uint16_t tfwd=tnext-ct;//avoid updating if there will not be enough time
    uint16_t tfwd=(uint16_t)round(cc*d_pixel*pixel_fsd);//ticks from tref to the first pixel of this refill
    // noting buffer is 3 forward of present pixel, and a bit extra room (but not to exceed width of pulse)
    uint16_t tefwd=timer_buffer[(timer_buffer_bytes/2)-1]-tref;//next loaded pixel firing time (this should be faster than calling TMR3_CMPLD10)
    uint32_t tefwd2=tefwd;//widened so that adding the margin below cannot wrap at 16 bits
    if(tfwd<(tefwd2+22)){
      c[0]++;//count how often a circle had to be skipped
      d_pixel+=512;//scheduled update would be too soon, so instead skip one circle
    }
    /*if(d_pixel>64 || tfwd<36){
      c[0]++;//record error
      c[1]=d_pixel;
      c[2]=tfwd;
      c[3]=target_pixel;
      //too far ahead (likely because we missed an update step) or too soon (because we are about to miss an update step)
      //resynchronize to actual next pixel (and skip some data) to avoid long loops of trying to catch up
      tnext=ct+36;//next reasonable update increment, find next pixel time after that and determine which pixel number that is
      tfwd=tnext-tref;
      float tffwd=tfwd*32.0f;//(((uint32_t)tfwd)<<pixel_s)
      d_pixel=tffwd/((float)cc);//always positive
      target_pixel=ceil(ref_pixel+d_pixel+1024);//new pixel to re-synchronize, add 1024 to ensure this is positive and not rounded to zero
      target_pixel%=512;
      //repeat above process
      d_pixel=(target_pixel-ref_pixel);
      while(d_pixel>=512){
        d_pixel-=512;
      }
      while(d_pixel<0){//wrap around overflow so that target is always in the future
        d_pixel+=512;
      }
      tnext=tref+(uint16_t)round(cc*d_pixel*pixel_fsd);
      tfwd=tnext-ct;//avoid updating if there will not be enough time
      c[4]=d_pixel;
      c[5]=tfwd;
      c[6]=target_pixel;
      TMR3_COMP10=tref+(uint16_t)round(cc*d_pixel*pixel_fsd);//replace next scheduled update to be the new pixel
      d_pixel=d_pixel+1;//no wrap-around here
      TMR3_CMPLD10=tref+(uint16_t)round(cc*d_pixel*pixel_fsd);
      d_pixel=d_pixel+1;//fill rest of DMA buffer synchronized to this point
      //hopefully by now the first brightness step FlexIO DMA has completed (loading in second brightness step)
      // but the second brightness step has not yet triggered, so DMA is now set up to load the next pixel which will be updated to the new pixel
      ldWriter.TCD->SADDR=laser_buffer+(target_pixel*16);//move to new pixel location first brightness step in buffer
      target_pixel+=2;//this corresponds to the first value loaded into buffer below (gets %512 below)
    }*/
    for(int a=0; a<(timer_buffer_bytes/2); a++){//next 32 pixels
      timer_buffer[a]=tref+(uint16_t)round(cc*d_pixel*pixel_fsd);
      d_pixel=d_pixel+1;//no wrap-around here
    }
    target_pixel=(target_pixel+32)%512;//next pixel value to load into buffer
  }

  //opt2_el was zeroed on entry, so its value here feeds the opt2_dur min/max statistics
  tel=opt2_el;
  if(tel>opt2_dur_max){
    opt2_dur_max=tel;
  }
  if(tel<opt2_dur_min){
    opt2_dur_min=tel;
  }
  
  //digitalWriteFast(3, LOW);//for debugging timing
  
  tmrWriter.clearInterrupt();
  asm volatile ("dsb");//memory barrier to ensure no re-entry (which may cause writing the buffer twice)
}
