From b90e6428964a06ca5db97afdeb43e86127fbed6e Mon Sep 17 00:00:00 2001
From: kaiburjack <kb@hbt.de>
Date: Wed, 2 Feb 2022 20:26:03 +0100
Subject: [PATCH] Integrate bufreader into Decoder

We already reached into the reader's internal
buffer every now and then to speed up scanning
through it, so we might as well integrate the
few buffer functions into the Decoder and keep
the buffer state there.
---
 bufreader.go |  81 --------------------------
 decoder.go   | 162 ++++++++++++++++++++++++++++++++++-----------------
 2 files changed, 109 insertions(+), 134 deletions(-)
 delete mode 100644 bufreader.go

diff --git a/bufreader.go b/bufreader.go
deleted file mode 100644
index 1fe9e9d..0000000
--- a/bufreader.go
+++ /dev/null
@@ -1,81 +0,0 @@
-package gosaxml
-
-import (
-	"encoding/binary"
-	"io"
-)
-
-type bufreader struct {
-	buf [4096]byte
-	rd  io.Reader
-	r   int
-	w   int
-}
-
-func (b *bufreader) read0() error {
-	if b.r > 0 {
-		copy(b.buf[:], b.buf[b.r:b.w])
-		b.w -= b.r
-		b.r = 0
-	}
-	n, err := b.rd.Read(b.buf[b.w:])
-	b.w += n
-	if n <= 0 && err != nil {
-		return err
-	}
-	return nil
-}
-
-func (b *bufreader) readByte() (byte, error) {
-	for b.r == b.w {
-		err := b.read0()
-		if err != nil {
-			return 0, err
-		}
-	}
-	c := b.buf[b.r]
-	b.r++
-	return c, nil
-}
-
-func (b *bufreader) unreadByte() {
-	b.r--
-}
-
-func (b *bufreader) unreadBytes(n int) {
-	b.r -= n
-}
-
-func (b *bufreader) readUint64() (uint64, int, error) {
-	if b.r+8 > b.w {
-		_ = b.read0()
-	}
-	n := b.w - b.r
-	if n > 8 {
-		n = 8
-	}
-	u := binary.LittleEndian.Uint64(b.buf[b.r : b.r+8])
-	b.r += n
-	return u, n, nil
-}
-
-func (b *bufreader) reset(r io.Reader) {
-	b.rd = r
-	b.r = 0
-	b.w = 0
-}
-
-func (b *bufreader) discardBuffer() {
-	b.r = b.w
-}
-
-func (b *bufreader) discard(n int) (int, error) {
-	for b.r+n > b.w {
-		err := b.read0()
-		if err != nil {
-			return 0, err
-		}
-	}
-	b.r += n
-	return n, nil
-}
diff --git a/decoder.go b/decoder.go
index 8e0a02f..a0f3ad6 100644
--- a/decoder.go
+++ b/decoder.go
@@ -22,14 +22,16 @@ type Decoder interface {
 }
 
 type decoder struct {
+	rb                  [2048]byte
 	bbOffset            [256]int32
 	numAttributes       [256]byte
 	lastOpen            Name
 	preserveWhitespaces [32]bool
-	r                   bufreader
+	rd                  io.Reader
 	bb                  []byte
 	attrs               []Attr
-	buf                 [8]byte
+	r                   int
+	w                   int
 	read                byte
 	write               byte
 	top                 byte
@@ -38,9 +40,7 @@ type decoder struct {
 // NewDecoder creates a new Decoder.
 func NewDecoder(r io.Reader) Decoder {
 	return &decoder{
-		r: bufreader{
-			rd: r,
-		},
+		rd:    r,
 		bb:    make([]byte, 0, 256),
 		attrs: make([]Attr, 0, 256),
 	}
@@ -50,8 +50,55 @@ func isWhitespace(b byte) bool {
 	return b == '\t' || b == '\n' || b == '\r' || b == ' '
 }
 
+func (thiz *decoder) read0() error {
+	if thiz.r > 0 {
+		copy(thiz.rb[:], thiz.rb[thiz.r:thiz.w])
+		thiz.w -= thiz.r
+		thiz.r = 0
+	}
+	n, err := thiz.rd.Read(thiz.rb[thiz.w:])
+	thiz.w += n
+	if n <= 0 && err != nil {
+		return err
+	}
+	return nil
+}
+
+func (thiz *decoder) unreadByte() {
+	thiz.r--
+}
+
+func (thiz *decoder) readByte() (byte, error) {
+	for thiz.r == thiz.w {
+		err := thiz.read0()
+		if err != nil {
+			return 0, err
+		}
+	}
+	c := thiz.rb[thiz.r]
+	thiz.r++
+	return c, nil
+}
+
+func (thiz *decoder) discardBuffer() {
+	thiz.r = thiz.w
+}
+
+func (thiz *decoder) discard(n int) (int, error) {
+	for thiz.r+n > thiz.w {
+		err := thiz.read0()
+		if err != nil {
+			return 0, err
+		}
+	}
+	thiz.r += n
+	return n, nil
+}
+
 func (thiz *decoder) Reset(r io.Reader) {
-	thiz.r.reset(r)
+	thiz.rd = r
+	thiz.r = 0
+	thiz.w = 0
 	thiz.attrs = thiz.attrs[:0]
 	thiz.bb = thiz.bb[:0]
 	thiz.top = 0
@@ -63,7 +110,7 @@ func (thiz *decoder) skipWhitespaces(b byte) (byte, error) {
 			return b, nil
 		}
 		var err error
-		b, err = thiz.r.readByte()
+		b, err = thiz.readByte()
 		if err != nil {
 			return 0, err
 		}
@@ -73,7 +120,7 @@ func (thiz *decoder) skipWhitespaces(b byte) (byte, error) {
 func (thiz *decoder) NextToken(t *Token) error {
 	for {
 		// read next character
-		b, err := thiz.r.readByte()
+		b, err := thiz.readByte()
 		if err != nil {
 			return err
 		}
@@ -87,24 +134,24 @@ func (thiz *decoder) NextToken(t *Token) error {
 			// Immediately closing last openend StartElement.
 			// This will generate an EndElement with the same
 			// name that we used in the previous StartElement.
-			_, err = thiz.r.discard(1)
+			_, err = thiz.discard(1)
 			if err != nil {
 				return err
 			}
 			return thiz.decodeEndElement(t, thiz.lastOpen)
 		case '<':
-			b, err = thiz.r.readByte()
+			b, err = thiz.readByte()
 			if err != nil {
 				return err
 			}
 			switch b {
 			case '?':
 				err = thiz.decodeProcInst(t)
-				thiz.r.unreadByte()
+				thiz.unreadByte()
 				return err
 			case '!':
 				// CDATA or comment
-				b, err = thiz.r.readByte()
+				b, err = thiz.readByte()
 				if err != nil {
 					return err
 				}
@@ -130,7 +177,7 @@ func (thiz *decoder) NextToken(t *Token) error {
 				return thiz.decodeStartElement(t)
 			}
 		default:
-			thiz.r.unreadByte()
+			thiz.unreadByte()
 			cntn, err := thiz.decodeText(t)
 			if err != nil || !cntn {
 				return err
@@ -154,7 +201,7 @@ func (thiz *decoder) decodeProcInst(t *Token) error {
 		if b == '?' {
 			for {
 				var b2 byte
-				b2, err = thiz.r.readByte()
+				b2, err = thiz.readByte()
 				if err != nil {
 					return err
 				}
@@ -181,7 +228,7 @@ func (thiz *decoder) decodeProcInst(t *Token) error {
 				j = len(thiz.bb)
 			}
 		}
-		b, err = thiz.r.readByte()
+		b, err = thiz.readByte()
 		if err != nil {
 			return err
 		}
@@ -189,26 +236,26 @@ func (thiz *decoder) decodeProcInst(t *Token) error {
 }
 
 func (thiz *decoder) ignoreComment() error {
-	_, err := thiz.r.discard(1)
+	_, err := thiz.discard(1)
 	if err != nil {
 		return err
 	}
 	for {
 		var b byte
-		b, err = thiz.r.readByte()
+		b, err = thiz.readByte()
 		if err != nil {
 			return err
 		}
 		if b == '-' {
 			var b2 byte
-			b2, err = thiz.r.readByte()
+			b2, err = thiz.readByte()
 			if err != nil {
 				return err
 			}
 			if b2 == '-' {
 				for {
 					var b3 byte
-					b3, err = thiz.r.readByte()
+					b3, err = thiz.readByte()
 					if err != nil {
 						return err
 					}
@@ -238,7 +285,7 @@ func (thiz *decoder) decodeStartElement(t *Token) error {
 	thiz.numAttributes[thiz.top] = 0
 	thiz.bbOffset[thiz.top] = int32(len(thiz.bb))
 	thiz.preserveWhitespaces[thiz.top+1] = thiz.preserveWhitespaces[thiz.top]
-	thiz.r.unreadByte()
+	thiz.unreadByte()
 	name, b, err := thiz.readName()
 	if err != nil {
 		return err
@@ -252,7 +299,7 @@ func (thiz *decoder) decodeStartElement(t *Token) error {
 	t.Kind = TokenTypeStartElement
 	t.Name = name
 	t.Attr = attributes
-	thiz.r.unreadByte()
+	thiz.unreadByte()
 	return nil
 }
 
@@ -260,27 +307,36 @@ func (thiz *decoder) decodeText(t *Token) (bool, error) {
 	i := len(thiz.bb)
 	onlyWhitespaces := true
 	for {
-		b, err := thiz.r.readByte()
+		j := thiz.r
+		for k := j; k < thiz.w; k++ {
+			b := thiz.rb[k]
+			if b == '<' {
+				_, err := thiz.discard(k - j)
+				if err != nil {
+					return false, err
+				}
+				if onlyWhitespaces && !thiz.preserveWhitespaces[thiz.top] {
+					return true, nil
+				}
+				thiz.bb = append(thiz.bb, thiz.rb[j:k]...)
+				t.Kind = TokenTypeTextElement
+				t.ByteData = thiz.bb[i:len(thiz.bb)]
+				return false, nil
+			}
+			onlyWhitespaces = onlyWhitespaces && isWhitespace(b)
+		}
+		thiz.bb = append(thiz.bb, thiz.rb[j:thiz.w]...)
+		thiz.discardBuffer()
+		err := thiz.read0()
 		if err != nil {
 			return false, err
 		}
-		if b == '<' {
-			thiz.r.unreadByte()
-			if onlyWhitespaces && !thiz.preserveWhitespaces[thiz.top] {
-				return true, nil
-			}
-			t.Kind = TokenTypeTextElement
-			t.ByteData = thiz.bb[i:len(thiz.bb)]
-			return false, nil
-		}
-		onlyWhitespaces = onlyWhitespaces && isWhitespace(b)
-		thiz.bb = append(thiz.bb, b)
 	}
 }
 
 func (thiz *decoder) readCDATA() error {
 	// discard "CDATA["
-	_, err := thiz.r.discard(6)
+	_, err := thiz.discard(6)
 	if err != nil {
 		return err
 	}
@@ -330,20 +386,20 @@ func isSeparator(b byte) bool {
 func (thiz *decoder) readSimpleName() ([]byte, byte, error) {
 	i := len(thiz.bb)
 	for {
-		j := thiz.r.r
-		for k := j; k < thiz.r.w; k++ {
-			if isSeparator(thiz.r.buf[k]) {
-				thiz.bb = append(thiz.bb, thiz.r.buf[j:k]...)
-				_, err := thiz.r.discard(k - j + 1)
+		j := thiz.r
+		for k := j; k < thiz.w; k++ {
+			if isSeparator(thiz.rb[k]) {
+				thiz.bb = append(thiz.bb, thiz.rb[j:k]...)
+				_, err := thiz.discard(k - j + 1)
 				if err != nil {
 					return nil, 0, err
 				}
-				return thiz.bb[i:len(thiz.bb)], thiz.r.buf[k], nil
+				return thiz.bb[i:len(thiz.bb)], thiz.rb[k], nil
 			}
 		}
-		thiz.bb = append(thiz.bb, thiz.r.buf[thiz.r.r:thiz.r.w]...)
-		thiz.r.discardBuffer()
-		err := thiz.r.read0()
+		thiz.bb = append(thiz.bb, thiz.rb[j:thiz.w]...)
+		thiz.discardBuffer()
+		err := thiz.read0()
 		if err != nil {
 			return nil, 0, err
 		}
@@ -368,7 +424,7 @@ func (thiz *decoder) decodeAttributes(b byte) ([]Attr, error) {
 			if err != nil {
 				return nil, err
 			}
-			b, err = thiz.r.readByte()
+			b, err = thiz.readByte()
 			thiz.numAttributes[thiz.top]++
 		}
 	}
@@ -379,7 +435,7 @@ func (thiz *decoder) decodeAttributes(b byte) ([]Attr, error) {
 // is the byte after the closing single or double quote
 // of the attribute's value.
 func (thiz *decoder) decodeAttribute(attr *Attr) error {
-	thiz.r.unreadByte()
+	thiz.unreadByte()
 	name, b, err := thiz.readName()
 	if err != nil {
 		return err
@@ -391,7 +447,7 @@ func (thiz *decoder) decodeAttribute(attr *Attr) error {
 	if b != '=' {
 		return fmt.Errorf("expected '=' character following attribute %+v", name)
 	}
-	b, err = thiz.r.readByte()
+	b, err = thiz.readByte()
 	if err != nil {
 		return err
 	}
@@ -418,19 +474,19 @@ func (thiz *decoder) readString(b byte) ([]byte, bool, error) {
 	i := len(thiz.bb)
 	singleQuote := b == '\''
 	for {
-		j := thiz.r.r
-		k := bytes.IndexByte(thiz.r.buf[j:thiz.r.w], b)
+		j := thiz.r
+		k := bytes.IndexByte(thiz.rb[j:thiz.w], b)
 		if k > -1 {
-			thiz.bb = append(thiz.bb, thiz.r.buf[j:j+k]...)
-			_, err := thiz.r.discard(k + 1)
+			thiz.bb = append(thiz.bb, thiz.rb[j:j+k]...)
+			_, err := thiz.discard(k + 1)
 			if err != nil {
 				return nil, false, err
 			}
 			return thiz.bb[i:len(thiz.bb)], singleQuote, nil
 		}
-		thiz.bb = append(thiz.bb, thiz.r.buf[thiz.r.r:thiz.r.w]...)
-		thiz.r.discardBuffer()
-		err := thiz.r.read0()
+		thiz.bb = append(thiz.bb, thiz.rb[j:thiz.w]...)
+		thiz.discardBuffer()
+		err := thiz.read0()
 		if err != nil {
 			return nil, false, err
 		}