/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8     -*-│
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8                                     :vi│
╞══════════════════════════════════════════════════════════════════════════════╡
│ Copyright 2022 Justine Alexandra Roberts Tunney                              │
│                                                                              │
│ Permission to use, copy, modify, and/or distribute this software for         │
│ any purpose with or without fee is hereby granted, provided that the         │
│ above copyright notice and this permission notice appear in all copies.      │
│                                                                              │
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
│ PERFORMANCE OF THIS SOFTWARE.                                                │
╚─────────────────────────────────────────────────────────────────────────────*/

//	@fileoverview Binary Lambda Calculus Virtual Machine
//	              In a 383 byte Linux x64 ELF executable
//
//	The Lambda Calculus is a mathematical language with 1 keyword.
//	It's a Turing tarpit, discovered by Turing's doctoral advisor.
//	This is a Church-Krivine-Tromp machine with ASCII oriented IO.
//	It implements garbage collection, lazy lists & tail recursion.
//	It extracts only the least significant bit from an stdin byte.
//	Output is only 0 and 1 bytes. It's slow but easy for learning.
//	Displacement is limited to [0,255] so progs can't be too huge.
//	Your ASCII binary serialization format, is defined as follows:
//
//	    00      means abstraction   (pops in the Krivine machine)
//	    01      means application   (push argument continuations)
//	    1⋯0     means variable      (with varint de Bruijn index)
//	    000010  e.g. means λλ0
//	    0000110 e.g. means λλ1
//
//	Your virtual machine may be compiled as follows:
//
//	    cc -no-pie -static -nostdlib -Wl,-oformat:binary -o blc blc.S
//
//	Your program is read from stdin followed by its input. Here's
//	a simple tutorial using the identity function (λ 0), which is
//	encoded as (00 10) in the binary lambda calculus:
//
//	    $ { printf 0010; printf 0101; } | ./blc; echo
//	    0101
//
//	You can use sed shell scripts as a byte code compiler. All it
//	has to do is `s/λ/00/g`, `s/\[/01/g`, `s/[^01]//g`, etc.
//
//	    #!/bin/sh
//	    tr \\n n |
//	      sed '
//	          s/;.*//
//	          s/#.*//
//	          s/1/⊤/g
//	          s/0/⊥/g
//	          s/λ/00/g
//	          s/\[/01/g
//	          s/9/11111111110/g
//	          s/8/1111111110/g
//	          s/7/111111110/g
//	          s/6/11111110/g
//	          s/5/1111110/g
//	          s/4/111110/g
//	          s/3/11110/g
//	          s/2/1110/g
//	          s/⊤/110/g
//	          s/⊥/10/g
//	          s/[^01]//g
//	        '
//
//	We can now write nicer looking programs:
//
//	    { printf %s "(λ 0)" | ./compile.sh
//	      printf 0101
//	    } | ./blc
//
//	This program means exit(0) because it returns `$nil` or `[]`:
//
//	    λ λλ0
//
//	Here's some important values:
//
//	    nil="λλ0"
//	    false="λλ0"
//	    true="λλ1"
//
//	Here's some important abstractions:
//
//	    if="λ 0"
//	    pair="λλλ [[0 2] 1]"
//	    car="λ [0 $true]"
//	    cdr="λ [0 $false]"
//	    or="λλ [[1 1] 0]"
//	    and="λλ [[1 0] 1]"
//	    not="λλλ [[2 0] 1]"
//	    xor="λλ [[1 λλ [[2 0] 1]] 0]"
//	    iszero="λλλ [[2 λ 1] 1]"
//	    Y="λ [λ [0 0] λ [1 [0 0]]]"
//
//	Here are your integers:
//
//	    zero="λλ 0"
//	    one="λλ [1 0]"
//	    two="λλ [1 [1 0]]"
//	    three="λλ [1 [1 [1 0]]]"
//	    four="λλ [1 [1 [1 [1 0]]]]"
//	    five="λλ [1 [1 [1 [1 [1 0]]]]]"
//	    six="λλ [1 [1 [1 [1 [1 [1 0]]]]]]"
//	    seven="λλ [1 [1 [1 [1 [1 [1 [1 0]]]]]]]"
//	    eight="λλ [1 [1 [1 [1 [1 [1 [1 [1 0]]]]]]]]"
//	    nine="λλ [1 [1 [1 [1 [1 [1 [1 [1 [1 0]]]]]]]]]"
//
//	Here's some arithmetic:
//
//	    pow="λλ [0 1]"
//	    mul="λλλ [2 [1 0]]"
//	    sub="λλ [[0 $dec] 1]"
//	    inc="λλλ [1 [[2 1] 0]]"
//	    dec="λλλ [[[2 λλ [0 [1 3]]] λ 1] λ 0]"
//	    add="λλλλ [[3 1] [[2 1] 0]]"
//	    fac="λλ [[[1 λλ [0 [1 λλ [[2 1] [1 0]]]]] λ1] λ0]"
//	    min="λλλλ [[[3 λλ [0 1]] λ1] [[2 λλ [3 [0 1]]] λ1]]"
//	    div="λλλλ [[[3 λλ [0 1]] λ 1] [[3 λ [[[3 λλ [0 1]] λ [3 [0 1]]] λ0]] 0]]"
//	    mod="λλλλ [[[3 $cdr] [[3 λ [[[3 λλλ [[0 [2 [5 1]]] 1]] λ1] 1]] λ1]] λλ0]"
//
//	Here's some predicates:
//
//	    eq="λλ [[[[1 λ [[0 λ0] λ0]] [[0 λλλ [1 2]] λλ0]] λλλ0] λλ1]"
//	    le="λλ [[[1 λλ [0 1]] λλλ1] [[0 λλ [0 1]] λλλ0]]"
//	    lt="λλ [[[0 λλ [0 1]] λλλ0] [[1 λλ [0 1]] λλλ1]]"
//	    odd="λ [λ [0 0] λλ [[0 λλ 1] λ [[0 λλ 0] [2 2]]]]"
//	    divides="λλ [[[1 $cdr] [λ [0 0] λ[[[1 λλλ [[0 [2 λλ0]] 1]] λ[1 1]] λλ1]]] λλ0]"
//
//	This program returns `[0, 1]` so it prints `10`.
//
//	    λ [[$pair $false] [[$pair $true] $nil]]
//
//	This program means if (1 - 1 == 0) putc('1') else putc('0');
//
//	    λ [[[$if [$iszero [[$sub $one] $one]]]
//	          [[$pair $false] $nil]]
//	       [[$pair $true] $nil]]
//
//	This program does the same thing as the ident program but
//	is more spelled out. The two arguments the runtime passes
//	are `gro` and `put` (or `λ [[0 wr0] wr1]`).  Index 110 is
//	is the outer parameter and 10 is the inner parameter.  So
//	this program is the same as doing `for (;;) putc(getc())`
//
//	    λλ [1 0]
//	    ││
//	    │└binds `put` or `(λ 0 wr0 wr1)` [cited as 0]
//	    └binds `gro` or `⋯` [cited as 1]
//
//	This will invert a stream of bits using the Y combinator.
//	It's got a whopping 16kBps of throughput.
//
//	    # a.k.a. Y(λab.(λc.c)b(λcde.❬¬c,ad❭)⊥)
//	    [$Y λλ [[[$if 0] λλλ [[$pair [$not 2]] [4 1]]] $nil]]
//	        ││           │││
//	        ││           ││└consumes $nil terminator [uncited]
//	        ││           │└binds 𝑝 input bit [cited as 1]
//	        ││           └binds (λ 0 𝑝 ⋯) [cited as 2]
//	        │└binds gro (λ 0 𝑝 ⋯) [cited by first 0]
//	        └binds recursive function [cited as 4]
//
//	This program means for x in reversed(stdin): put(x)
//
//	    # a.k.a. λa.a(ω(λbcde.d(bb)(λf.fce)))⊥
//	    λ [[0 [λ [0 0] λλλλ [[1 [3 3]] λ [[0 3] 1]]]] $nil]
//
//	This program means ['1'] * 4**3 times:
//
//	    λ [[$Y λλ [[[$if [$iszero 0]]
//	                    $nil]
//	                 [[$pair $false]
//	                  [1 [$dec 0]]]]]
//	       [[$pow $four] $three]]
//
//	If you need to exponentiate bigger numbers like 9**3 then you'll
//	likely need to tune the STACK parameter below, to mmap something
//	bigger than what the operating system provides by default.
//
//	Your VM expands your program on startup as follows:
//
//	    𝑝 ⟶ [λ [0 λ [[0 wr0] wr1]] [𝑝 ⋯]]
//
//	The lazy list convention reduces as follows:
//
//	    ⋯ ⟹ $nil                     ;; if eof / error
//	    ⋯ ⟹ λ [[0 $false] ⋯]         ;; if ~getc() & 1
//	    ⋯ ⟹ λ [[0 $true] ⋯]          ;; if  getc() & 1
//
//	The `wr0` and `wr1` conventions reduce as follows:
//
//	    wr0 ⟹ λ [0 λ [[0 wr0] wr1]]  ;; w/ putc(0) side-effect
//	    wr1 ⟹ λ [0 λ [[0 wr0] wr1]]  ;; w/ putc(1) side-effect
//
//	Here's a BLC interpreter written in BLC which is 232 bits.
//
//	    [[λ [0 0]
//	      λλλ [[[0 λλλλ [2 λ [[4 [2 λ [[1 [2 λλ [2 λ [[0 1] 2]]]]
//	                                   [3 λ [3 λ [[2 0] [1 0]]]]]]]
//	                          [[0 [1 λ [0 1]]]
//	                           λ [[3 λ [3 λ [1 [0 3]]]] 4]]]]]
//	            [2 2]] 1]]
//	     λ [0 [λ [0 0] λ [0 0]]]]
//
//	@see	https://tromp.github.io/cl/Binary_lambda_calculus.html
//	@see	https://www.ioccc.org/2012/tromp/hint.html

#define TRACE   0		// enable ./trace.sh support
#define FASTR   0		// favor perf over tininess
#define TERMS	5000000		// number of words of bss
#define STACK	0		// bytes of stack to get

#define IOP	0		// code for read, write0, write1, flush
#define VAR	1		// code for variable name lookup
#define APP	2		// code for applications
#define ABS	3		// code for abstractions

#define NEXT	0*8
#define ENVP	1*8
#define REFS	2*8+0
#define TERM	2*8+4

#define envp	%rbp
#define contp	%r9
#define frep	%r8
#define eof	%r13
#define eofb	%r13b
#define eofd	%r13d
#define idx	%rbx
#define idxb	%bl
#define idxd	%ebx

	.macro	pushpop constexpr:req register:req
	.byte	0x6a,\constexpr
	pop	%r\register
	.endm

	.macro	mxchg register:req memory:req
#if FASTR
	mov	\register,%rax
	mov	\memory,\register
	mov	%rax,\memory
#else
	xchg	\register,\memory
#endif
	.endm

	.macro	stlog ordinal:req	# strace logging
#if TRACE
	push	%rax
	push	%rcx
	pushpop	\ordinal,ax
	syscall
	pop	%rcx
	pop	%rax
#endif
	.endm
	.macro	getpid
	stlog	0x27
	.endm
	.macro	getuid
	stlog	0x66
	.endm
	.macro	getgid
	stlog	0x68
	.endm
	.macro	getppid
	stlog	0x6e
	.endm
	.macro	geteuid
	stlog	0x6b
	.endm
	.macro	getegid
	stlog	0x6c
	.endm

	.bss
	.zero	TERMS
	.previous

ehdr:	.ascii	"\177ELF"

////////////////////////////////////////////////////////////////////////////////
//	TWELVE BYTE OVERLAP		#
//	.byte	2			# EI_CLASS is ELFCLASS64
//	.byte	1			# EI_DATA is ELFDATA2LSB
//	.byte	1			# EI_VERSION is 1
//	.byte	3			# EI_OSABI is ELFOSABI_LINUX
//	.quad	0			#
kRom1:	.byte	ABS			#  0       (λ ((0 (λ (λ ?))) ⋯))
	.byte	  APP			#  1       8
	.byte	  8			#──2──┐    -
	.byte	    APP			#  3  │    (0 (λ (λ ?)))
	.byte	    2			#──4────┐  (read (λ (λ ?)))
	.byte	      VAR		#  5  │ │  0
	.byte	      0			#  6  │ │  read
	.byte	    ABS			#──7────┘  (λ (λ ?))
	.byte	      ABS		#  8  │    (λ ?)
	.byte	        VAR		#  9  ┴    ?
	.byte	0			# exit(0) %al
	.byte	0			# elf padding                     [mark]
////////////////////////////////////////////////////////////////////////////////

ehdr2:	.word	2			# e_type is ET_EXEC           [precious]
	.word	62			# e_machine is EM_X86_64      [precious]

////////////////////////////////////////////////////////////////////////////////
//	FOUR BYTE OVERLAP		#
//	.long	1			# e_version is 1                  [mark]
Bye2:	pop	%rax			# __NR_exit
	syscall				#
	.byte	0			# elf padding
////////////////////////////////////////////////////////////////////////////////

ehdr3:	.quad	_start			# e_entry                     [precious]
	.quad	phdrs - ehdr		# e_phoff is 56               [precious]

////////////////////////////////////////////////////////////////////////////////
//	FOURTEEN BYTE OVERLAP		#
//	.quad	0xc681c031		# e_shoff  [should be 0]          [mark]
//	.long	0xfce2abac		# e_flags  [should be 0]          [mark]
//	.word	0xc3			# e_ehsize [should be 64]         [mark]
Get:	push	%rdi			#
	xor	%eax,%eax		# __NR_read
	xor	%edi,%edi		# stdin
	mov	$buf,%esi		# buf
	syscall				#
	jmp	Get2			#
////////////////////////////////////////////////////////////////////////////////

	.word	56			# e_phentsize                 [precious]

////////////////////////////////////////////////////////////////////////////////
//	EIGHT BYTE OVERLAP		#
//	.word	1			# e_phnum              [correct overlap]
//	.word	0			# e_shentsize          [correct overlap]
//	.word	1|2|4			# e_shnum              [p_flags clobber]
//	.word	0			# e_shstrndx           [correct overlap]
phdrs:	.long	1			# p_type is PT_LOAD
	.long	1|2|4			# p_flags is PF_X|PF_W|PF_R
////////////////////////////////////////////////////////////////////////////////

	.quad	0			# p_offset                    [precious]
	.quad	ehdr			# p_vaddr                     [precious]

////////////////////////////////////////////////////////////////////////////////
//	EIGHT BYTE OVERLAP		#
//	.quad	ehdr			# p_paddr                         [mark]
Get2:	and	%dl,(%rsi)		# 1. al= 1 (si)='0' → ZF=1 CF=1 EAX=0
	sub	%dl,(%rsi)		# 2. al= 1 (si)='1' → ZF=1 CF=0 EAX=0
	dec	%eax			# 3. al= 0 (si)=??? → ZF=0 CF=? EAX<0
	pop	%rdi			# 4. al=-1 (si)=??? → ZF=0 CF=? EAX<0
.Lret:	ret				#
////////////////////////////////////////////////////////////////////////////////

phdrs2:	.quad	filesz			# p_filesz         [insurmountable gulf]
	.quad	memsz			# p_memsz          [insurmountable gulf]
//	.quad	4096			# p_align

Bye:	xchg	%edi,%eax
	shr	$16,%edi
	push	$60			# __NR_exit
	jmp	Bye2
	.size	Bye,.-Bye

Gc:	decl	REFS(%rax)		# unref memory (order matters)
	jnz	.Lret			# 1. free parents via recursion
	push	%rax			# 2. free self
	mov	NEXT(%rax),%rax		# 3. free siblings via iteration
	call	Gc
	pop	%rax
	mov	frep,NEXT(%rax)
	mov	%rax,frep
	mov	ENVP(%rax),%rax
	jmp	Gc
	.size	Gc,.-Gc

Parse:	push	%rdi			# save 1
0:	.byte	0xB0			# lda §movsb,%al (nop next byte)
1:	movsb				# 00 is abstraction
	call	*%r14			# Get
	jnc	2f
	call	*%r14			# Get
	jc	1b
1:	mov	$APP,%al		# 01 is application
	stosb
	push	%rdi			# save 2
	scasb
	call	Parse
	pop	%rsi			# rest 2
	mov	%al,(%rsi)
	jmp	0b
2:	mov	$VAR,%al		# 1⋯ is variable
	stosb				# 0-based de Bruijn indices
	neg	%al
3:	inc	%al
	push	%rax
	call	*%r14			# Get
	pop	%rax
	jnc	3b
	stosb
	pop	%rsi			# rest 1
	mov	%edi,%eax
	sub	%esi,%eax
	ret
	.size	Parse,.-Parse

Var:	getuid
	push	envp
	.byte	0x3D			# cmp §0x6D8B48,%eax (nop 4x)
1:	mov	NEXT(envp),envp
	dec	%ecx
	jns	1b
2:	mov	TERM(envp),idxd
	mov	ENVP(envp),envp
	incl	REFS(envp)
	pop	%rax
	call	Gc
	jmp	Rex
	.size	Var,.-Var

Abs:	getpid
	test	contp,contp
	jz	Bye
	mxchg	envp,NEXT(contp)
	xchg	envp,contp
	jmp	Rex
	.size	Abs,.-Abs

Gro:	call	*%r14			# Get
	pushpop	10,cx
	mov	$kRom1,%esi
 	jz	2f
	add	$7,%esi
2:	mov	$0,%al
	adc	$0,%al
	rep movsb
	stosb
	jmp	Rex
	.size	Gro,.-Gro

_start:
#if STACK
	mov	$STACK,%rsi
	mov	$9,%al			# __NR_mmap
	mov	$3,%dl			# PROT_READ|PROT_WRITE
	mov	$0x0122,%r10w		# MAP_PRIVATE|MAP_ANONYMOUS|MAP_GROWSDOWN
	syscall
	lea	-24(%rax,%rsi),%rsp
	mov	$0,%dl
#endif
	mov	$Get,%r14d		# saves two bytes
	mov	%rsp,envp		# prevent segfaults clobber argv[0]
	inc	%dl			# dx=1 for read() and write()
	mov	$kRom+(kEnd-kRom+1),%edi
	call	Parse			# parse expr (xxx: tight displacement)
	mov	%al,-1(%rsi)
//	jmp	Rex			# sets main() apply length

Rex:	mov	kRom(idx),%eax		# head normal form reduction
	movzbl	%ah,%ecx		# %al should be ∈ {0,1,2,3}
	inc	idxd
	cmp	$APP,%al
	ja	Abs
	je	App
	test	%al,%al
	jnz	Var
//	jmp	Iop
	.size	Rex,.-Rex

Iop:	getppid				# lazy lists like haskell
	dec	idxd
	cmp	$16,idxd		# length of rom
	ja	Gro
//	jmp	Put
	.size	Iop,.-Iop

Put:	add	$35,idxd		# 13,14 += 48,49 or '0','1'
	push	idx
	mov	%rsp,%rsi
	mov	$2,idxb			# λ 0 λ 0 wr0 wr1
  	push	%rdi
	mov	%edx,%edi		# stdout
	mov	%edx,%eax		# __NR_write
	syscall
	pop	%rdi
	pop	%rax			# free stack
	jmp	Rex
	.size	Put,.-Put

App:	getgid
	test	frep,frep
	jnz	1f
	xor	%eax,%eax
	push	%rax			# calloc() on stack lool
	push	%rax
	push	%rax
	mov	%rsp,frep
1:	inc	idxd
	mxchg	contp,NEXT(frep)	# get closure from free list
	xchg	contp,frep
	incl	REFS(contp)		# save machine state
	incl	REFS(envp)
	mov	envp,ENVP(contp)
	add	idxd,%ecx
	mov	%ecx,TERM(contp)
	jmp	Rex
	.size	App,.-App

buf:	.byte	0
kRom:	.byte	APP			#  0         [λ [0 λ [[0 wr0] wr1]] [main ⋯]]
	.byte	.Lloop-1f		#──1─┐
1:	.byte	  ABS			#  2 │       λ [0 λ [[0 wr0] wr1]]
	.byte	    APP			#  3 │       [0 λ [[0 wr0] wr1]]
	.byte	    .Lw01-1f		#──4───┐
1:	.byte	      VAR,0		#  5 │ │     0
.Lw01:	.byte	  ABS			#──7─┴─┘     λ [[0 wr0] wr1]
	.byte	    APP			#  8 │       [[0 wr0] wr1]
	.byte	    4			#─ 9───┐
	.byte	      APP		# 10 │ │     [0 wr0]
	.byte	      1			#─11─────┐   1
	.byte	        VAR		# 12 │ │ │   0
.Lwr:	.byte	      IOP		#─13─────┘   wr0
	.byte	  IOP			#─14───┘     wr1
.Lloop:	.byte	APP			#─15─┘       [main ⋯]
kEnd:

	.globl	ehdr
	.globl	_start
	.type	kRom,@object
	.type	kRom1,@object
	.type	ehdr,@object
	.type	ehdr2,@object
	.type	ehdr3,@object
	.type	phdrs,@object
	.type	phdrs2,@object
	.type	buf,@object
	.weak	filesz
	.weak	memsz
