From 0cf6d7a9af934c76d0aa7d58f46923f8dbbac6d8 Mon Sep 17 00:00:00 2001 From: Puck Meerburg Date: Thu, 1 May 2025 13:20:05 +0000 Subject: [PATCH] (zilch lang ninja): improve parser performance --- lang/ninja/src/ninja.sld | 114 +++++++++++++++++++++++++-------------- 1 file changed, 73 insertions(+), 41 deletions(-) diff --git a/lang/ninja/src/ninja.sld b/lang/ninja/src/ninja.sld index 2de84a3..f9c5bab 100644 --- a/lang/ninja/src/ninja.sld +++ b/lang/ninja/src/ninja.sld @@ -60,6 +60,13 @@ (and (char>=? ch #\0) (char<=? ch #\9)) (char=? ch #\_) (char=? ch #\-) (and (not simple) (char=? ch #\.)))) + (define (is-valid-varname-byte ch simple) + (or + (and (>= ch #x61) (<= ch #x7A)) + (and (>= ch #x41) (<= ch #x5A)) + (and (>= ch #x30) (<= ch #x39)) + (= ch #x5F) (= ch #x2D) + (and (not simple) (= ch #x2E)))) (define (is-valid-eval-string-char ch path) (not @@ -74,6 +81,19 @@ (char=? ch #\:) (char=? ch #\|)))))) + (define (is-valid-eval-string-byte ch path) + (not + (or + (= ch 0) + (= ch #x0A) + (= ch #x0D) + (= ch #x24) + (and path + (or + (= ch #x20) + (= ch #x3A) + (= ch #x7C)))))) + (define (resolve-evalstring evalstr file rule edge is-command) (define (find-value val) (define res #f) @@ -197,56 +217,68 @@ (try-resolve (build-rule-restat rule) #f) (try-resolve (build-rule-rspfile rule) #f) (try-resolve (build-rule-rspfile-content rule) #f))) - + + (define (bytevector-prefix? bv in-bv index) + (define bv-len (bytevector-length bv)) + (define (inner-loop j) + (cond + ((>= j bv-len) #t) + ((= (bytevector-u8-ref bv j) (bytevector-u8-ref in-bv (+ index j))) + (inner-loop (+ j 1))) + (else #f))) + (if (>= (+ index bv-len) (bytevector-length in-bv)) + #f + (inner-loop 0))) + ;; Reads a full Ninja file, returning a record. (define (read-ninja-file strval) (define i 0) (define (eat-whitespace) (cond - ((>= i (string-length strval))) - ((string-prefix? "$\r\n" strval 0 3 i) (set! i (+ i 3)) (eat-whitespace)) - ((string-prefix? "$\n" strval 0 2 i) (set! i (+ i 2)) (eat-whitespace)) - ((char=? (string-ref strval i) #\space) (set! i (string-skip strval (lambda (ch) (char=? ch #\space)) i)) (eat-whitespace)))) + ((>= i (bytevector-length strval))) + ((bytevector-prefix? #u8(#x24 #x0D #x0A) strval i) (set! i (+ i 3)) (eat-whitespace)) + ((bytevector-prefix? #u8(#x24 #x0A) strval i) (set! i (+ i 2)) (eat-whitespace)) + ((= (bytevector-u8-ref strval i) #x20) (set! i (bytestring-index strval (lambda (ch) (not (= ch #x20))) i)) (eat-whitespace)))) (define (read-eval-string-text path) (define collected '()) (define start-i i) (define (tick) (cond - ((>= i (string-length strval)) (error "unexpected EOF")) - ((is-valid-eval-string-char (string-ref strval i) path) + ((>= i (bytevector-length strval)) (error "unexpected EOF")) + ((is-valid-eval-string-byte (bytevector-u8-ref strval i) path) (let ((start-i i)) - (do () ((or (>= i (string-length strval)) (not (is-valid-eval-string-char (string-ref strval i) path)))) + (do () ((or (>= i (bytevector-length strval)) (not (is-valid-eval-string-byte (bytevector-u8-ref strval i) path)))) (set! i (+ i 1))) - (set! collected (cons (string-copy strval start-i i) collected))) + (set! collected (cons (utf8->string strval start-i i) collected))) (tick)) - ((and path (member (string-ref strval i) '(#\space #\: #\| #\newline)))) - ((and (not path) (char=? #\newline (string-ref strval i))) + ((and path (member (bytevector-u8-ref strval i) '(#x20 #x3A #x7C #x0A)))) + ((and (not path) (= #x0A (bytevector-u8-ref strval i))) (set! i (+ i 1))) - ((string-prefix? "\r\n" strval 0 2 i) + ((bytevector-prefix? #u8(#x0D #x0A) strval i) (unless path (set! i (+ i 2)))) - ((string-prefix? "$\r\n" strval 0 3 i) - (set! i (string-skip strval (lambda (ch) (char=? ch #\space)) (+ i 3))) + ((bytevector-prefix? #u8(#x24 #x0D #x0A) strval i) + (set! i (bytestring-index strval (lambda (ch) (not (= ch #x20))) (+ i 3))) (tick)) - ((string-prefix? "$\n" strval 0 2 i) - (set! i (string-skip strval (lambda (ch) (char=? ch #\space)) (+ i 2))) + ((bytevector-prefix? #u8(#x24 #x0A) strval i) + (set! i (bytestring-index strval (lambda (ch) (not (= ch #x20))) (+ i 2))) (tick)) - ((string-prefix? "${" strval 0 2 i) - (let ((end-of-varname (string-skip strval (lambda (ch) (is-valid-varname-char ch #f)) (+ i 2)))) - (unless (char=? (string-ref strval end-of-varname) #\}) (error "unexpected non-varname character at index" end-of-varname)) - (set! collected (cons (cons 'varname (string-copy strval (+ i 2) end-of-varname)) collected)) + ((bytevector-prefix? #u8(#x24 #x7B) strval i) + (let ((end-of-varname (bytestring-index strval (lambda (ch) (not (is-valid-varname-byte ch #f))) (+ i 2)))) + (unless (= (bytevector-u8-ref strval end-of-varname) #x7D) (error "unexpected non-varname character at index" end-of-varname)) + (set! collected (cons (cons 'varname (utf8->string strval (+ i 2) end-of-varname)) collected)) (set! i (+ end-of-varname 1))) (tick)) - ((or (string-prefix? "$$" strval 0 2 i) (string-prefix? "$:" strval 0 2 i) (string-prefix? "$ " strval 0 2 i)) - (set! collected (cons (string-copy strval (+ i 1) (+ i 2)) collected)) + ((or (bytevector-prefix? #u8(#x24 #x24) strval i) (bytevector-prefix? #u8(#x24 #x3A) strval i) (bytevector-prefix? #u8(#x24 #x20) strval i)) + (set! collected (cons (utf8->string strval (+ i 1) (+ i 2)) collected)) (set! i (+ i 2)) (tick)) - ((and (char=? (string-ref strval i) #\$) (is-valid-varname-char (string-ref strval (+ i 1)) #t)) - (let ((end-of-varname (string-skip strval (lambda (ch) (is-valid-varname-char ch #t)) (+ i 1)))) - (set! collected (cons (cons 'varname (string-copy strval (+ i 1) end-of-varname)) collected)) + ((and (= (bytevector-u8-ref strval i) #x24) (is-valid-varname-byte (bytevector-u8-ref strval (+ i 1)) #t)) + (let ((end-of-varname (bytestring-index strval (lambda (ch) (not (is-valid-varname-byte ch #t))) (+ i 1)))) + (set! collected (cons (cons 'varname (utf8->string strval (+ i 1) end-of-varname)) collected)) (set! i end-of-varname)) (tick)) - ((char=? (string-ref strval i) #\$) + ((= (bytevector-u8-ref strval i) #x24) (error "bad $-escape at index" i)) (else (error "Unknown character at index" i)))) (tick) @@ -263,41 +295,41 @@ (define (read-token) (define start-i i) - (define post-space (or (string-skip strval (lambda (ch) (char=? ch #\space)) i) i)) + (define post-space (or (bytestring-index strval (lambda (ch) (not (= ch #x20))) i) i)) (define token (cond - ((>= i (string-length strval)) + ((>= i (bytevector-length strval)) 'eof) - ((string-prefix? "#" strval 0 1 post-space) - (let ((end-of-comment (or (string-index strval (lambda (ch) (char=? ch #\newline)) post-space) (string-length strval)))) + ((bytevector-prefix? #u8(#x23) strval post-space) + (let ((end-of-comment (or (bytestring-index strval (lambda (ch) (= ch #x0A)) post-space) (bytevector-length strval)))) (set! i end-of-comment) (read-token))) - ((string-prefix? "\r\n" strval 0 2 post-space) + ((bytevector-prefix? #u8(#x0D #x0A) strval post-space) (set! i (+ post-space 2)) 'newline) - ((string-prefix? "\n" strval 0 1 post-space) + ((bytevector-prefix? #u8(#x0A) strval post-space) (set! i (+ post-space 1)) 'newline) ((> post-space i) 'indent) - ((char=? (string-ref strval i) #\=) + ((= (bytevector-u8-ref strval i) #x3D) (set! i (+ i 1)) 'equals) - ((char=? (string-ref strval i) #\:) + ((= (bytevector-u8-ref strval i) #x3A) (set! i (+ i 1)) 'colon) - ((string-prefix? "|@" strval 0 2 i) + ((bytevector-prefix? #u8(#x7C #x40) strval i) (set! i (+ i 2)) 'pipe-at) - ((string-prefix? "||" strval 0 2 i) + ((bytevector-prefix? #u8(#x7C #x7C) strval i) (set! i (+ i 2)) 'pipe-pipe) - ((char=? (string-ref strval i) #\|) + ((= (bytevector-u8-ref strval i) #x7C) (set! i (+ i 1)) 'pipe) - ((is-valid-varname-char (string-ref strval i) #f) - (let* ((token-end (string-skip strval (lambda (ch) (is-valid-varname-char ch #f)) i)) - (token (string-copy strval i token-end))) + ((is-valid-varname-byte (bytevector-u8-ref strval i) #f) + (let* ((token-end (bytestring-index strval (lambda (ch) (not (is-valid-varname-byte ch #f))) i)) + (token (utf8->string strval i token-end))) (set! i token-end) (cond ((string=? token "build") 'build) @@ -401,7 +433,7 @@ ((pool) (error "todo: pool declaration")) ((newline) (read-toplevel file)) (else - (unless (string? token) (error "unexpected" token)) + (unless (string? token) (error "unexpected" (list token i))) (unless (expect-token 'equals) (error "expected =, found" (read-token))) (let* ((value (read-eval-string-text #f)) (resolved (resolve-evalstring value file #f #f #f))) (set-build-file-global-variables! file (mapping-set! (build-file-global-variables file) token resolved)))