	device	zxspectrum48
	org	#8200

; Program entry: load one block straight into display memory.
; Sets up the ld_bytes arguments (IX = destination, DE = length, A = expected
; flag byte, CF set) and returns to the caller when ld_bytes reports success
; (CF=1). On failure it falls into the ROM error restart.
SCREEN_ADDR	equ	#4000		;start of display memory
SCREEN_SIZE	equ	#1B00		;6912 bytes are requested
DATA_FLAG	equ	#FF		;flag byte the block must start with
ERR_REPORT	equ	#1A		;byte that follows the error restart

start:
	di
	ld	a,DATA_FLAG
	scf
	ld	de,SCREEN_SIZE
	ld	ix,SCREEN_ADDR
	call	ld_bytes
	ret	c		;CF=1: block loaded, nothing more to do
	rst	#08		;otherwise report an error through the ROM
	db	ERR_REPORT	;report code consumed by the restart


; ld_bytes - load one block from tape with the PLL-based bit loader.
; Stream order as consumed below: leader, sync bits, flag byte, data bytes,
; two Fletcher checksum bytes (low sum first, then high sum).
; In:  IX = address where the first data byte is stored
;      DE = number of data bytes (moved to DE' and counted down there)
;      A  = expected flag byte (compared with the first decoded byte)
;      CF = saved together with A; it is lost at the flag comparison
; Out: CF=1 when every byte decoded and both checksum bytes matched
;      CF=0 on break key, invalid line code, flag or checksum mismatch
; Every exit passes through ROM address 053Fh, which is pushed first.
; Uses AF, AF', BC, DE, HL, IX, BC', DE'. BC' holds the two running sums.
; All T-state counts in the comments are part of the design: do not add,
; remove or reorder instructions without re-deriving the timing.
ld_bytes:
	ex	af,af'		;Save the "load/verify" flag and the type code
	ld	hl,053fh	;the sa/ld-ret ROM subroutine to restore border color and handle break key
	push	hl		;every ret below therefore continues at 053fh
	push	de		;file length will be in de' in this loader
	exx
	pop	de
	ld	bc,0101h	;Initial value of the Fletcher's checksum
	exx
	in	a,(0feh)
	rra
	and	20h		;keep only the ear bit
	or	2		;red border
	ld	c,a		;C = last seen ear level (bit 5) + border colour
	cp	a		;set zero flag so the first ret nz below falls through
	di			;no interrupts while the loops are cycle-counted
; ld_edge_1/2 return CF=0,ZF=1 on timeout and CF=0,ZF=0 on break, so this
; ret nz leaves only when the break key was pressed; timeouts simply retry.
ld_break:
	ret	nz
ld_start:
	call	ld_edge_1	;carry flag reset means no edge
	jr	nc,ld_break	;try again if no edge
; Wait a while and check if the signal is still pulsing
ld_badleader:
	ld	hl,0415h	;outer delay counter; B is the inner djnz counter
ld_wait:
	djnz	ld_wait
	dec	hl
	ld	a,h
	or	l
	jr	nz,ld_wait
	call	ld_edge_2	;now check for two edges within allowed time
	jr	nc,ld_break	;no edges
; Now accept only the leader signal. Measure distance between edges to start-up the PLL
	ld	h,0		;H counts accepted measurements
	ld	de,0		;edge-distance measurement will be accumulated here
; ld_leader loop time = 85+t(ld_edge_1)
ld_leader:
	ld	b,0CEH		;7 timing constant (max = 50)
	call	ld_edge_1       ;17 + t(ld_edge1)
	jr	nc,ld_break	;7/12 an edge must be found within the time limit
	ld	a,b		;4 B = 0CEh + number of samples taken
	cp	0D7H		;7 and it must not arrive too soon after the previous one
	jr	c,ld_badleader	;7/12 B below 0D7h: too short, restart leader detection
	add	a,e		;4
	ld	e,a		;4
	adc	a,d		;4
	sub	e		;4
	ld	d,a		;4 16-bit addition
	inc	h		;4 count measured edge intervals in H until 256 have been accumulated
	jr	nz,ld_leader	;7/12
; After the leader there is a sync pulse expected, so the remainder of the leader will be loaded here
; The period is now measured so we can start up the PLL and run it for achieving a better phase lock accuracy.
; DE now holds the sum of 256 B readings, so D is their average.
; Recalculate the measured period into PLL state value based on the following:
; - ld_edge_1 b->time calculation: t = 469+(db-1)*65+tbc, tbc=time between calls
; - time between calls is 85, therefore t = 554+(db-1)*65 from the ld_leader loop, for two bits
; - ld_bit is called every 618+2*(d-1)*13 Tcy
; - to convert db into d, the equation is 554+(db-1)*65=1236+4*(d-1)*13, -682+(db-1)*65=52*(d-1), d=1-682/52+(db-1)*1.25
;   we calculate it in fixed point as ((db-1)*5-52)/4+1
	ld	a,d		;4
	sub	0cfh		;7 get absolute value of db without offset and subtract one
	ld	d,a		;4
	ld	h,d		;4
	ld	l,e		;4
	add	hl,hl		;11
	add	hl,hl		;11
	add	hl,de		;11 multiply by 5
	ld	a,h		;4 get (db-1)*5
	sub	48		;7 48=52-4, this is to add one
	sla	a		;8 the result must be divided by 4 and mult by 8, therefore one shift left
	ld	d,a		;4 the initial VCO value
; time from the last signal edge to here is 95+56+79 Tcy = 230 Tcy
; from call to ld_grbits to first in t=39+(d-1)*13+17
; in ld_sync loop before call to ld_grbits (including call): 55
; complete waiting for two bit times which is 1236+4*(d-1)*13
; Remaining: 951+4*(d-1)*13
	ld	a,d		;4
	rra			;4
	and	7Fh		;7
	ld	b,a             ;4
ld_sync0:
	djnz	ld_sync0	;8/13
; completed waiting for 27+4*(d-1)*13 Tcy, remaining: 924, do some work as well
	ld	a,c		;4
	rla			;4 get the current signal status
	sbc	a,a		;4 extend it
	ld	l,a		;4 store for ensuring that 1100 or 0011 groups are loaded
	ld	b,69		;7
ld_sync1:
	djnz	ld_sync1	;8/13
	xor	a		;4 delay and set Z flag
	ret	nz		;5 delay (will never return)
; Leader/sync tracking: two bits are loaded per pass. A pair equal to the
; previous pair restarts leader detection; a pair of two equal bits keeps
; looping; a pair of two different bits is the sync and ends the loop.
; time between calls to ld_grbits must be 84, same as in ld_loop and ld_byte
ld_sync:
	ld	a,07fh		;7
	in	a,(0feh)	;11
	rra			;4
	ret	nc		;5/11 return if break key was pressed
	ld	h,l		;4 save the two previous bits in h
	ld	l,40h		;7 load 2 bits to see if they are the same
	call	ld_grbits	;17 + t(ld_grbits)
	ld	a,l		;4
	xor	h		;4 compare with previous 2 data bits
	jr	z,ld_badleader	;7/12 abort loading if they are the same (bad leader)
	xor	h		;4 restore the current data bits in A and check parity
	jp	pe,ld_sync	;10 loop until two different bits are loaded (the sync bits)
; A sync sequence was detected (two different bits). Data follows. The last loaded data bit
; becomes optional inversion bit.
	rrca			;4 get it in C flag
	sbc	a,a		;4 extend
	xor	c		;4 invert last data bit stored in C(0) if optional inversion bit becomes set
	ld	c,a		;4 last data bit (possibly inverted) is always 0 here
; Load the file type flag before entering the main loop
; time from last call to ld_grbits: 45, should be 43, 2tcy mismatch will hopefully not hurt too much
	call	ld_byte		;17
	ret	nc		;5/11 return on error
	ex	af,af'		;4 A = flag byte expected by the caller
	xor	e		;4 warning: CF (verify) is erased here
	ret	nz		;5/11 flag value mismatch - error return (CF=0 after xor)
	ex	af,af'		;4
; Main loading loop
; Each pass first folds the previously decoded byte (E) into the checksum and
; decrements the byte counter, then decodes the next byte as four 4-bit
; groups of two data bits each (most significant pair first) and stores it.
; The first pass therefore adds the flag byte to the checksum.
; Timing:
; From first call to ld_bit to first call to ld_grbits (incl call ld_grbits) - 109
; Between calls to ld_grbits (incl call) - 84
; From last call to ld_grbits to second call to ld_bit (incl call ld_bit) - 101
; From second call to ld_bit to first call to ld_bit (incl first call) - 126
ld_loop:
	ld	l,10h		;7 set stop-bit to load 4 encoded bits
	call	ld_bit		;17 first bit of the group is read directly
	rl	l		;8 shift it in; ld_grbits below reads the other three
; Update fletcher's checksum and dec byte counter on the way
	ld	a,e		;4 previously loaded and decoded data byte
	exx			;4
	add	a,c		;4
	adc	a,0		;7 modulo 255 addition
	ld	c,a		;4 C' = first (low) sum
	add	a,b		;4
	adc	a,0		;7 modulo 255 addition
	ld	b,a		;4 B' = second (high) sum
	dec	de		;6 DE' = bytes still to store after this one
	exx			;4
	ld	a,7fh		;7
	in	a,(0feh)	;11
	rra			;4
	ret	nc		;5/11 return if break key was pressed
	ret	nc		;5 delay (CF is set here, never returns)
	nop			;4
	call	ld_grbits	;17
	ld	h,high coding_tables ;7 H stays on the table page for the whole byte
	ld	a,low coding_tbl_4b ;7
	add	a,l		;4
	ld	l,a		;4
	ld	a,(hl)		;7 decode the group of 4 bits
	rrca			;4 get status bit in C
	ccf			;4 on error, return with CF=0
	ret	nc		;5/11 loading errors are detected on excluded codes
	rlca			;4 undo the rrca: decoded pair is back in bits 3..2
	ld	e,a		;4
	jp	$+3		;10 delay
	ld	l,10h		;7 set stop-bit to load 4 encoded bits

	call	ld_grbits	;17
	ld	a,low coding_tbl_4b	;7
	add	a,l		;4
	ld	l,a		;4
	ld	a,(hl)		;7 decode the group of 4 bits
	rrca			;4
	ccf			;4
	ret	nc		;5/11
	ret	nc		;5 delay
	rrca			;4 new pair now in bits 1..0
	or	e		;4 merge with the pairs decoded so far
	rlca			;4
	rlca			;4 make room for the next pair
	ld	e,a		;4
	ld	l,10h		;7 set stop-bit to load 4 encoded bits

	call	ld_grbits	;17
	ld	a,low coding_tbl_4b	;7
	add	a,l		;4
	ld	l,a		;4
	ld	a,(hl)		;7 decode the group of 4 bits
	rrca			;4
	ccf			;4
	ret	nc		;5/11
	ret	nc		;5 delay
	rrca			;4
	or	e		;4
	rlca			;4
	rlca			;4
	ld	e,a		;4
	ld	l,20h		;7 set stop-bit to load 3 encoded bits

	call	ld_grbits	;17
	ld	b,6		;7 delay 84 t-states
ld_loop_delay0:
	djnz	ld_loop_delay0	;8/13
	nop			;4
	call	ld_bit		;17 load the fourth bit separately to save time
	ld	a,l		;4
	rla			;4 shift in the last data bit
	add	a,low coding_tbl_4b ;7
	ld	l,a		;4
	ld	a,(hl)		;7 decode the group of 4 bits
	rrca			;4 get status bit in C
	ccf			;4
	ret	nc		;5/11
	rrca			;4 NOTE (ARTEM): this instruction breaks timing
	or	e		;4 A = complete decoded byte
	ld	e,a		;4
	ld	(ix),a		;19 store it
	inc	ix		;10
	exx			;4
	ld	a,d		;4
	or	e		;4 DE' = 0 when the requested number of bytes is stored
	exx			;4
	jp	nz,ld_loop	;10

; Data loading is finished - load and check the Fletcher's checksum
; Timing:
; From last call to ld_bit to first call to ld_bit (incl cur call) - 126
; From first call to ld_bit to first call to ld_byte0 (incl call) - 92
; From call to ld_byte0 to call to ld_bit - 46 (incl call)
; From second call to ld_bit to second call to ld_byte0 (incl call) - 92
	ld	l,10h		;7
	call	ld_bit		;17 must use direct path here because timing is tight after the loop
	rl	l		;8
; Update Fletcher's checksum with the last loaded data byte
	ld	a,e		;4 previously loaded and decoded data byte
	exx			;4
	add	a,c		;4
	adc	a,0		;7 modulo 255 addition
	ld	c,a		;4
	add	a,b		;4
	adc	a,0		;7 modulo 255 addition
	ld	b,a		;4
	exx			;4
	jr	$+2		;12 25 t-states delay
	xor	a		;4 delay and prepare z flag for the next instruction
	ret	nz		;5 delay (will never return)
	nop			;4
	call	ld_byte0	;17 second entry into ld_byte
	ret	nc		;5 return on error
	exx			;4
	xor	c		;4 compare first (lsb) byte and reset the C flag
	exx			;4
	ret	nz		;5/11 return error status on mismatch
; Timing is tight again - we must use the direct entry to ld_bit
	ld	l,10h		;7
	call	ld_bit		;17
	rl	l		;8
; Delay 67 t-states
	ld	b,5		;7
ld_check_delay0:
	djnz	ld_check_delay0	;8/13
	call	ld_byte0	;17 second entry into ld_byte
	ret	nc		;5 return on error
	exx			;no more timing - loading is finished now
	xor	b		;compare second (msb) checksum byte, CF is reset
	exx
	ret	nz		;return error status on mismatch
	scf			;loading successful - checksum matches
	ret


; Routine to load a data byte
; it is not used within the main loop because of timing optimization
; A byte is sent as four 4-bit line-code groups; each group decodes through
; coding_tbl_4b into two data bits, most significant pair first.
; in d - VCO state, C - optional inversion flag
; ld_byte0 entry additionally needs L to hold the stop-bit pattern with the
; first bit of the first group already shifted in (callers do
; ld l,10h / call ld_bit / rl l before calling it).
; out A,E - loaded data byte, CF=1 on success, 0 on failure (invalid code)
; Uses A, B, HL; C and D are updated by ld_bit.
; Timing:
; From entry to first call to ld_grbits (including call) - 24
; From alt entry to first call to ld_grbits (including call) - 17
; Between calls to ld_grbits (including second call) - 84
; From last call to ld_grbits to exit - 55
ld_byte:
	ld	l,10h		;7 stop-bit to load 4 encoded bits
; extra entry point for the checksum loading path
ld_byte0:
	call	ld_grbits	;17
	ld	h,high coding_tables	;7 H stays on the table page for all four lookups
	ld	a,low coding_tbl_4b	;7
	add	a,l		;4
	ld	l,a		;4
	ld	a,(hl)		;7 decode the group of 4 bits
	rrca			;4 table bit 0 (error marker) goes to CF
ccret:	ccf			;4 shared error exit: CF=0 when the marker was set
	ret	nc		;5/11 loading errors are detected on excluded codes
	rlca			;4 undo the rrca: decoded pair is back in bits 3..2
	ld	e,a		;4
	jp	$+3		;10 delay
	ld	l,10h		;7 set stop-bit to load 4 encoded bits

	call	ld_grbits	;17
	ld	a,low coding_tbl_4b	;7
	add	a,l		;4
	ld	l,a		;4
	ld	a,(hl)		;7 decode the group of 4 bits
	rrca			;4
	ccf			;4
	ret	nc		;5/11
	ret	nc		;5 delay (CF is set here, never returns)
	rrca			;4 new pair now in bits 1..0
	or	e		;4 merge with the pairs decoded so far
	rlca			;4
	rlca			;4 make room for the next pair
	ld	e,a		;4
	ld	l,10h		;7 set stop-bit to load 4 encoded bits

	call	ld_grbits	;17
	ld	a,low coding_tbl_4b	;7
	add	a,l		;4
	ld	l,a		;4
	ld	a,(hl)		;7 decode the group of 4 bits
	rrca			;4
	ccf			;4
	ret	nc		;5/11
	ret	nc		;5 delay
	rrca			;4
	or	e		;4
	rlca			;4
	rlca			;4
	ld	e,a		;4
	ld	l,10h		;7 set stop-bit to load 4 encoded bits

	call	ld_grbits	;17
	ld	a,l		;4
	add	a,low coding_tbl_4b ;7
	ld	l,a		;4
	ld	a,(hl)		;7 decode the group of 4 bits
	rrca			;4 get status bit in C
	jr	c,ccret		;7/12 this will return with C flag reset on error
	rrca			;4 NOTE (ARTEM): this instruction breaks timing
	or	e		;4 A = complete decoded byte
	ld	e,a		;4
	scf			;4 report success
	ret			;10

; Load a group of bits indicated by initial state of L register ('1' indicates stop-bit)
; Each ld_bit result (returned in CF) is rotated into L from the right; the
; routine returns when the stop bit is rotated out of bit 7, so 10h reads
; four bits, 20h three and 40h two.
; in  L - stop-bit pattern (may already contain bits shifted in by the caller)
;     C, D - state for ld_bit
; out L - the loaded bits, right-aligned; CF=1 (the stop bit)
; Uses A, B through ld_bit and the delay loop.
; Note the layout: ld_grbits0 is the loop tail (inter-bit delay) and sits
; in front of the entry point ld_grbits so it can fall through into it.
; Timing:
; Time between calls to ld_bit - 126+t(ld_bit)
; From entry to first call to ld_bit - 17
; From last call to ld_bit to exit - 25
; From exit to entry (via ld_loop) - 84
ld_grbits0:
	;delay 89 t-states
	ld	b,5		;7
ld_grbits1:
	djnz	ld_grbits1	;8/13
	jr	$+2		;12
	jp	$+3		;10
ld_grbits:
	call	ld_bit		;17
	rl	l		;8 store the current bit into the input shift register
	jr	nc,ld_grbits0	;7/12 loop until stop bit is encountered
	ret			;10

; Load a single bit, update PLL state
; The ear input is sampled twice: the first sample is the phase detector bit,
; the second one the data bit. When the data bit differs from the previous
; one, the two samples are compared and D is stepped by +1 or -1.
; in/out d - PLL state
; in c(5) - optional inversion bit
; in c(0) - previous data bit
; out c(0) - current data bit, c(5) - optional inversion bit (restored)
; out CF - current data bit (same value as c(0)); ld_grbits rotates it into L
; The break key is not checked here; callers poll it themselves.
; Uses A and B.
; Timing:
; from entry to first in - 42+(d-1)*13 + ((d>>1)&1)*5
; from first in to second in - 306+(d-1)*13 + ((d>>1)&1)*5 + +((d>>2)&1)*13
; from second in to exit (including ret) - 138 or 185 if phasedelay branch was taken
; from exit to entry (via ld_grbits) - 126
ld_bit:
	ld	a,d		;4 Current VCO value
	rra			;4
	rra			;4
	jr	c,$+2		;7/12 variable delay depending on bit 1 of D
	rra			;4
	and	1Fh		;7
	ld	b,a             ;4
ld_dhbt:
	djnz	ld_dhbt		;8/13 Delay for 1/2 bit time
	in	a,(0feh)	;11
	rra			;4
	xor	c		;4 process optional inversion
	and	20h		;7 extract the Ear bit
	add	a,0E0h		;7 move it into C flag (when a=20h, there will be a carry)
	rl	c		;8 store the phase detector bit into C register, bit 0

; Out the previous data bit into border
	ld	a,c		;4
	rra			;4
	rra			;4 previous data bit (now c(1)) lands in CF
	sbc	a,a		;4 previous bit went into C flag
	and	7		;7
	xor	9		;7
	out	(0feh),a	;11

	ld	b,13		;7 Make a delay of 175 t-states to equalize bit sampling times
ld_ibdl:
	djnz	ld_ibdl		;8/13
	nop			;4

	ld	a,d		;4 Current VCO value
	rra			;4
	rra			;4
	jr	c,$+2		;7/12 variable delay depending on bit 1 of D
	rra			;4
	adc	a,0		;7 round up the value in D, for higher granularity in VCO speed
	and	1Fh		;7
	ld	b,a		;4
ld_dhbt2:
	djnz	ld_dhbt2	;8/13 Delay for another 1/2 bit time
	in	a,(0feh)	;11
	xor	c		;4 process optional inversion (no rra but rl c - positions match)
	rla			;4 move the Ear bit to bit 7
	rla			;4 move the Ear bit out to CF
	rl	c		;8 store the current data bit in C, bit 0
; C is now: bit 7 inversion, bit 2 previous data, bit 1 phase, bit 0 current
; Compare previous data bit with current data bit
	ld	a,c		;4
	and	5		;7 extract bits 0 and 2 (current and previous data bits)
	jp	pe,ld_samebit	;10 skip if there was no signal change
; Compare current data bit with phase detector bit
	rla			;4 shift current data bit into bit 1
	xor	c		;4 xor it with phase detector bit
	and	2		;7 extract the difference bit: 2 means lead, 0 means lag
	jr	nz,phadelay	;7/12 delay 7 or 34 t-states to compensate lead
nophadelay:
	dec	a		;4 now it becomes 1 or -1
; Now we need to either add or subtract 1 to the VCO
	add	a,d		;4
	sub	8h		;7 clip at 8h: a result of 7 borrows here...
	adc	a,8h		;7 ...and the borrow bumps it back to 8; otherwise unchanged
	ld	d,a		;4
; Restore C register for the next call and return	
; Move back the optional inversion bit by 2 positions, keep current bit at 0 position
ld_bitfin:
	ld	a,c		;4
	rrca			;4 current bit goes into bit 7
	rra			;4 replicate current bit in b7
	sra	a		;8 replicate current bit again in b7
	rlca			;4 current bit goes back to position 0 with optinv bit restored
	ld	c,a		;4 CF still holds the current data bit for the caller
	ret			;10
; Equalize timing of the PLL update branch - 48 t-states
ld_samebit:
	jr	$+2		;12
	jr	$+2		;12
	jr	$+2		;12
	jr	ld_bitfin	;12

; Delay 22 t-states before arriving to nophadelay
phadelay:	
	jp	$+3		;10
	jr	nophadelay	;12

;------ Standard ld-edge routines
; ld_edge_1 waits for one change of the ear input, ld_edge_2 for two in a row
; (it calls ld_edge_1 once and then falls through into it).
; in  B - time counter; it is incremented once per sample and the routine
;         gives up when it wraps to zero
;     C - bit 5 = last seen ear level, bits 0..2 = current border colour
; out CF=1       - edge found; C has been complemented and the new colour sent
;                  to the border; B tells how many samples it took
;     CF=0, ZF=1 - timeout
;     CF=0, ZF=0 - break key pressed
; Uses A.
; timing for ld_edge1:
; delay - 16*(n-1) + 18, when n=16h t=358
; from entry to first in - 374
; between successive ins - 65=5*13
; when edge detected, from last in to exit (incl ret) - 95
; b->time calculation: t = 469+(db-1)*65+tbc, tbc=time between calls
ld_edge_2:
	call	ld_edge_1	;17
	ret	nc		;5/11 first edge missing: pass the failure on
ld_edge_1:
	ld	a,16h		;7 wait 358 t-states before sampling loop
ld_delay:
	dec	a		;4
	jr	nz,ld_delay	;7/12
	and	a		;4 clear CF so that a timeout returns CF=0
ld_sample:
	inc	b		;4
	ret	z		;5/11 return CF=0, ZF=1 if timeout
	ld	a,7fh		;7 check Break key & Ear signals
	in	a,(0feh)	;11
	rra			;4
	jr	nc,p_ret	;7/12 return CF=0, ZF=0 if break key was pressed (jr to burn 2tcy)
	nop			;4
	xor	c		;4 compare with the level remembered in C
	and	20h		;7
	jr	z,ld_sample	;7/12 loop until edge is detected
	ld	a,c		;4
	cpl			;4 remember the new level and flip the border colour
	ld	c,a		;4
	and	7		;7
	or	8		;7 signal Mic Off
	out	(0feh),a	;11
	scf			;4 signal success
p_ret:	ret			;10


	;org	#8400

; Decoding table for the 4-bit line code, indexed by the received group.
; Entry format: bit 0 = 1 marks a code that is never transmitted (loading
; error); otherwise bits 3..2 hold the two decoded data bits. A code and its
; bitwise complement decode to the same pair, which is what makes the
; optional signal inversion harmless.
;
; The decoders load H once with "high coding_tables" and form the low byte
; with an 8-bit add of the group value to "low coding_tbl_4b", so both labels
; must share a high byte and the 16 entries must not cross a 256-byte page.
; The asserts below turn a violation into an assembly error; they emit no
; bytes and do not move any code.
coding_tables:


coding_tbl_4b:
	db	00000001b	;0000 - error
	db	00000001b	;0001 - error
	db	00001000b	;0010 - "10"
	db	00001100b	;0011 - "11"
	db	00000100b	;0100 - "01"
	db	00000000b	;0101 - "00"
	db	00000001b	;0110 - error
	db	00000001b	;0111 - error
	db	00000001b	;1000 - error
	db	00000001b	;1001 - error
	db	00000000b	;1010 - "00"
	db	00000100b	;1011 - "01"
	db	00001100b	;1100 - "11"
	db	00001000b	;1101 - "10"
	db	00000001b	;1110 - error
	db	00000001b	;1111 - error

	assert	(coding_tables >> 8) == (coding_tbl_4b >> 8)
	assert	(coding_tbl_4b >> 8) == ((coding_tbl_4b + 15) >> 8)


	savebin "loader.bin",start,0x250

