diff --git a/chacha20/chacha_mipsle.go b/chacha20/chacha_mipsle.go
new file mode 100644
index 0000000000..7057171607
--- /dev/null
+++ b/chacha20/chacha_mipsle.go
@@ -0,0 +1,16 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build gc && !purego
+
+package chacha20
+
+const bufSize = blockSize // xorKeyStream in chacha_mipsle.s consumes its input in 64-byte steps
+
+//go:noescape
+func xorKeyStream(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32) // implemented in chacha_mipsle.s
+// xorKeyStreamBlocks runs the assembly core over src into dst; the block counter is advanced in place through &s.counter.
+func (s *Cipher) xorKeyStreamBlocks(dst, src []byte) {
+	xorKeyStream(dst, src, &s.key, &s.nonce, &s.counter)
+}
diff --git a/chacha20/chacha_mipsle.s b/chacha20/chacha_mipsle.s
new file mode 100644
index 0000000000..16093621f4
--- /dev/null
+++ b/chacha20/chacha_mipsle.s
@@ -0,0 +1,185 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Ported from https://github.com/torvalds/linux/blob/1b294a1f35616977caddaddf3e9d28e576a1adbc/arch/mips/crypto/chacha-core.S
+// which is licensed under:
+// # ====================================================================
+// # SPDX-License-Identifier: GPL-2.0 OR MIT
+// #
+// # Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
+// # Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+// # ====================================================================
+
+//go:build gc && !purego
+
+#include "textflag.h"
+
+#define X0	R1 // X0-X15 hold the sixteen 32-bit words of the working ChaCha state
+#define X1	R2
+#define X2	R3
+#define X3	R4
+#define X4	R5
+#define X5	R6
+#define X6	R7
+#define X7	R8
+#define X8	R9
+#define X9	R10
+#define X10	R11
+#define X11	R12
+#define X12	R13
+#define X13	R14
+#define X14	R15
+#define X15	R16
+
+#define DST 		R17 // write cursor into dst
+#define SRC 		R18 // read cursor into src
+#define SRC_LEN 	R19 // bytes of src still to be processed
+#define KEY			R20 // pointer to the eight key words
+#define NONCE 		R21 // pointer to the three nonce words
+#define CTR			R22 // address of the counter on entry, afterwards the counter value itself
+
+#define LOOP_I		R24 // rounds left for the current block (R23 is skipped; presumably reserved for the assembler, confirm)
+#define TMP			R25 // scratch register, clobbered by ROTL (non-ROTR variant), ADD_MEM and XOR_STREAM_WORD
+
+#ifdef GOMIPS_r2
+#define hasROTR
+#endif
+#ifdef GOMIPS_r5
+#define hasROTR
+#endif
+// ROTL(S, R) rotates R left by S bits. NOTE(review): confirm that GOMIPS_r2/GOMIPS_r5 are ever defined for the assembler; otherwise only the #else variant is built.
+#ifdef hasROTR
+#define ROTL(S, R) 			\ // rotate left by S == rotate right by 32-S
+	ROTR	$(32-S), R
+#else
+#define ROTL(S, R) 			\ // shift/or fallback; clobbers TMP
+	SLL		$(S), R, TMP	\ // TMP = R << S
+	SRL		$(32-S), R 		\ // R = R >> (32-S)
+	OR 		TMP, R // R = (R << S) | (R >> (32-S))
+#endif
+
+#define AXR(A, B, C, D,  K, L, M, N,  V, W, Y, Z,  S) \ // one add/xor/rotate step applied to four lanes at once
+	ADDU	K, A 	\ // A += K (likewise B += L, C += M, D += N)
+	ADDU	L, B 	\
+	ADDU	M, C 	\
+	ADDU	N, D 	\
+	XOR		A, V 	\ // V ^= A (likewise W ^= B, Y ^= C, Z ^= D)
+	XOR		B, W 	\
+	XOR		C, Y 	\
+	XOR		D, Z 	\
+	ROTL	(S, V) 	\ // rotate each of V, W, Y, Z left by S bits
+	ROTL	(S, W) 	\
+	ROTL	(S, Y) 	\
+	ROTL	(S, Z)
+
+#define FOR_STATE(OP, OP_MEM) \ // pairs each word of the initial state with X0..X15: OP for immediate/register sources, OP_MEM for memory sources
+	OP (    $0x61707865,	X0 ) 	\ // "expa"
+	OP (    $0x3320646e,	X1 ) 	\ // "nd 3"
+	OP (    $0x79622d32, 	X2 ) 	\ // "2-by"
+	OP (	$0x6b206574,	X3 ) 	\ // "te k"
+	OP_MEM (	0(KEY), 	X4 ) 	\ // key words 0..7 go to X4..X11
+	OP_MEM (    4(KEY), 	X5 ) 	\
+	OP_MEM (    8(KEY), 	X6 ) 	\
+	OP_MEM (    12(KEY), 	X7 ) 	\
+	OP_MEM (    16(KEY), 	X8 ) 	\
+	OP_MEM (    20(KEY), 	X9 ) 	\
+	OP_MEM (    24(KEY),	X10 ) 	\
+	OP_MEM (    28(KEY),	X11 ) 	\
+	OP (    	CTR,		X12 ) 	\ // block counter (a register value, not a memory operand)
+	OP_MEM (    0(NONCE),	X13 ) 	\ // nonce words 0..2 go to X13..X15
+	OP_MEM (    4(NONCE),	X14 ) 	\
+	OP_MEM (    8(NONCE),	X15 )
+
+#define movw(x, y) \ // y = x; macro form of MOVW so it can be handed to FOR_STATE as OP / OP_MEM
+	MOVW x, y
+
+#define ADD(V, REG)  \ // REG += V, where V is an immediate or a register
+	ADDU	V, REG
+
+#define ADD_MEM(ADDR, REG)  \ // REG += 32-bit word at ADDR; clobbers TMP
+	MOVW    ADDR, TMP 		\
+	ADDU	TMP, REG
+
+// XOR_STREAM_WORD XORs the key stream word in REG with the OFF-th 32-bit word of SRC and stores the result at the same offset of DST.
+// It works on unaligned memory, which is important because the streams are often not aligned, especially when used from TLS.
+#define XOR_STREAM_WORD( OFF, REG) 	\
+	MOVWL	(4*OFF + 3)(SRC), TMP 	\ // MOVWL + MOVWR together form one unaligned 32-bit load into TMP
+	MOVWR	(4*OFF)(SRC), TMP 		\
+	XOR		REG, TMP				\ // TMP = source word ^ key stream word
+	MOVWL	TMP, (4*OFF + 3)(DST) 	\ // MOVWL + MOVWR together form one unaligned 32-bit store of TMP
+	MOVWR	TMP, (4*OFF)(DST)
+
+// func xorKeyStream(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
+TEXT ·xorKeyStream(SB), NOSPLIT|NOFRAME, $0
+	MOVW	dst+0(FP), 		DST // first word of the dst slice header, i.e. its base pointer
+	MOVW	src+12(FP), 	SRC // base pointer of src
+	MOVW	src_len+16(FP), SRC_LEN
+	MOVW	key+24(FP),		KEY
+	MOVW	nonce+28(FP),	NONCE
+	MOVW	counter+32(FP), CTR
+
+	// CTR holds the address of the counter up to here; from now on it holds the counter value
+	MOVW	(CTR), CTR
+
+chacha: // start of one 64-byte block
+
+	// load the initial state (constants, key, counter, nonce) into X0-X15
+	FOR_STATE ( movw, movw )
+
+	// 20 rounds: every pass through loop performs a column round and a diagonal round and counts down by 2
+	MOVW	$20, LOOP_I
+
+loop:
+	AXR( X0,X1,X2,X3,		X4,X5,X6,X7,		X12,X13,X14,X15,	16) // column round: a += b; d ^= a; d <<<= 16
+	AXR( X8,X9,X10,X11,		X12,X13,X14,X15,	X4,X5,X6,X7,		12) // c += d; b ^= c; b <<<= 12
+	AXR( X0,X1,X2,X3,		X4,X5,X6,X7,		X12,X13,X14,X15,	8) // a += b; d ^= a; d <<<= 8
+	AXR( X8,X9,X10,X11,		X12,X13,X14,X15,	X4,X5,X6,X7,  		7) // c += d; b ^= c; b <<<= 7
+	AXR( X0,X1,X2,X3,		X5,X6,X7,X4,		X15,X12,X13,X14,	16) // diagonal round: the same four steps with the b, c, d lanes rotated
+	AXR( X10,X11,X8,X9,		X15,X12,X13,X14,	X5,X6,X7,X4, 		12)
+	AXR( X0,X1,X2,X3,		X5,X6,X7,X4,		X15,X12,X13,X14,	8)
+	AXR( X10,X11,X8,X9,		X15,X12,X13,X14,	X5,X6,X7,X4,  		7)
+
+	ADDU	$-2, LOOP_I
+	BNE     LOOP_I, loop
+
+	// add back the initial state to generate the key stream
+	FOR_STATE ( ADD, ADD_MEM )
+
+	// xor the 16 key stream words with the source and write out the result (unaligned-safe)
+	XOR_STREAM_WORD (0, X0)
+	XOR_STREAM_WORD (1, X1)
+	XOR_STREAM_WORD (2, X2)
+	XOR_STREAM_WORD (3, X3)
+	XOR_STREAM_WORD (4, X4)
+	XOR_STREAM_WORD (5, X5)
+	XOR_STREAM_WORD (6, X6)
+	XOR_STREAM_WORD (7, X7)
+	XOR_STREAM_WORD (8, X8)
+	XOR_STREAM_WORD (9, X9)
+	XOR_STREAM_WORD (10, X10)
+	XOR_STREAM_WORD (11, X11)
+	XOR_STREAM_WORD (12, X12)
+	XOR_STREAM_WORD (13, X13)
+	XOR_STREAM_WORD (14, X14)
+	XOR_STREAM_WORD (15, X15)
+
+	// decrement length by the block just processed
+	ADDU	$-64, SRC_LEN, SRC_LEN
+
+	// advance both pointers by one block
+	MOVW 	$64(DST), DST
+	MOVW	$64(SRC), SRC
+
+	// increment the block counter
+	ADDU	$1, CTR
+
+	// loop while data is left; the length is only compared against zero after a block, so it must be a non-zero multiple of 64 on entry
+	BNE     SRC_LEN, chacha
+
+	// store the advanced counter back through the pointer argument
+	MOVW	counter+32(FP), TMP
+	MOVW	CTR, (TMP)
+	
+	RET
+
diff --git a/chacha20/chacha_noasm.go b/chacha20/chacha_noasm.go
index db42e6676a..f305ac3c5b 100644
--- a/chacha20/chacha_noasm.go
+++ b/chacha20/chacha_noasm.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build (!arm64 && !s390x && !ppc64le) || !gc || purego
+//go:build (!arm64 && !s390x && !ppc64le && !mipsle) || !gc || purego
 
 package chacha20
 
diff --git a/chacha20poly1305/chacha20poly1305_test.go b/chacha20poly1305/chacha20poly1305_test.go
index 82a4a36102..2e70c877ba 100644
--- a/chacha20poly1305/chacha20poly1305_test.go
+++ b/chacha20poly1305/chacha20poly1305_test.go
@@ -202,7 +202,7 @@ func benchamarkChaCha20Poly1305Open(b *testing.B, buf []byte, nonceSize int) {
 }
 
 func BenchmarkChacha20Poly1305(b *testing.B) {
-	for _, length := range []int{64, 1350, 8 * 1024} {
+	for _, length := range []int{64, 1024, 1350, 2 * 1024, 4 * 1024, 8 * 1024, 16 * 1024} {
 		b.Run("Open-"+strconv.Itoa(length), func(b *testing.B) {
 			benchamarkChaCha20Poly1305Open(b, make([]byte, length), NonceSize)
 		})
diff --git a/internal/poly1305/mac_noasm.go b/internal/poly1305/mac_noasm.go
index 333da285b3..ba8805e3f6 100644
--- a/internal/poly1305/mac_noasm.go
+++ b/internal/poly1305/mac_noasm.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build (!amd64 && !ppc64le && !s390x) || !gc || purego
+//go:build (!amd64 && !ppc64le && !s390x && !mipsle) || !gc || purego
 
 package poly1305
 
diff --git a/internal/poly1305/sum_mipsle.go b/internal/poly1305/sum_mipsle.go
new file mode 100644
index 0000000000..6501ddb44b
--- /dev/null
+++ b/internal/poly1305/sum_mipsle.go
@@ -0,0 +1,53 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build gc && !purego
+
+package poly1305
+
+// mac is a wrapper for macGeneric that redirects calls that would have gone to
+// updateGeneric to update, the assembly routine declared at the end of this file.
+//
+// Its Write and Sum methods hand update whole TagSize-byte blocks together with an
+// explicit pad bit; using function pointers would carry a major performance cost.
+type mac struct{ macGeneric }
+
+// Write absorbs p: it completes a buffered partial block first, feeds all whole TagSize-byte blocks to update, and buffers the tail.
+func (h *mac) Write(p []byte) (int, error) {
+	written := len(p)
+	if h.offset > 0 {
+		c := copy(h.buffer[h.offset:], p)
+		p, h.offset = p[c:], h.offset+c
+		if h.offset < TagSize {
+			return written, nil
+		}
+		h.offset = 0
+		update(&h.macState, h.buffer[:], 1)
+	}
+	if full := len(p) / TagSize * TagSize; full > 0 {
+		update(&h.macState, p[:full], 1)
+		p = p[full:]
+	}
+	if len(p) > 0 {
+		h.offset += copy(h.buffer[h.offset:], p)
+	}
+	return written, nil
+}
+
+func (h *mac) Sum(out *[16]byte) {
+	state := h.macState
+	if n := h.offset; n > 0 {
+		h.buffer[n] = 1
+		n++
+		for ; n < TagSize; n++ {
+			h.buffer[n] = 0
+		}
+
+		update(&state, h.buffer[:], 0)
+	}
+	finalize(out, &state.h, &state.s)
+}
+
+//go:noescape
+func update(state *macState, msg []byte, padbit uint32) // implemented in sum_mipsle.s; len(msg) must be a non-zero multiple of 16, padbit is added to h4 for every block
diff --git a/internal/poly1305/sum_mipsle.s b/internal/poly1305/sum_mipsle.s
new file mode 100644
index 0000000000..d829c65f6d
--- /dev/null
+++ b/internal/poly1305/sum_mipsle.s
@@ -0,0 +1,230 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Ported from https://github.com/WireGuard/wireguard-monolithic-historical/blob/edad0d6e99e5133b1e8e865d727a25fff6399cb4/src/crypto/zinc/poly1305/poly1305-mips.S
+// which is licensed under:
+// # ====================================================================
+// # SPDX-License-Identifier: GPL-2.0 OR MIT
+// #
+// # Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
+// # Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+// # ====================================================================
+
+//go:build gc && !purego
+
+#include "textflag.h"
+
+#define MADDU(rs, rt) \ // hand-encoded MADDU rs, rt: (HI,LO) += rs * rt, unsigned; rs and rt are register NUMBERS (the *_n defines), not names
+	WORD $(0x70000001 + (rs << 21) + (rt << 16));
+
+#define ADDU_C(CA, D, H)\ // H += CA + D; on exit CA holds the carry out of both additions; clobbers D
+	ADDU	CA, H 		\
+	SGTU	CA, H, CA  	\ // CA = 1 if H wrapped around (sum smaller than the addend)
+	ADDU	D, H  		\
+	SGTU	D, H, D  	\ // D = 1 if H wrapped around
+	ADDU	D, CA
+
+#define ADDU_CA(CA, H) 	\ // H += CA; on exit CA holds the carry out
+	ADDU	CA, H 		\
+	SGTU	CA, H, CA
+
+#define PTR_POLY1305_H(n) (n*4)(STATE) // n-th 32-bit word of the accumulator h at the start of the state
+#define PTR_POLY1305_R(n) ( 24 + (n*4))(STATE) // n-th 32-bit word of r; assumes r starts 24 bytes into macState (TODO confirm against its definition)
+
+#define P_H0 	R1 // P_H0-P_H4: accumulator h as five 32-bit limbs
+#define P_H0_n 	1 // the *_n defines are the plain register numbers needed by the hand-encoded MADDU
+#define P_H1 	R2
+#define P_H1_n 	2
+#define P_H2	R3
+#define P_H2_n	3
+#define P_H3 	R4
+#define P_H3_n 	4
+#define P_H4 	R5
+#define P_H4_n 	5
+
+#define P_R0 	R6 // P_R0-P_R3: the four 32-bit words of r
+#define P_R0_n 	6
+#define P_R1 	R7
+#define P_R1_n 	7
+#define P_R2 	R8
+#define P_R2_n 	8
+#define P_R3 	R9
+#define P_R3_n 	9
+
+#define P_S1 	R10 // P_S1-P_S3: Sx = Rx + (Rx >> 2), computed once on entry
+#define P_S1_n 	10
+#define P_S2 	R11
+#define P_S2_n 	11
+#define P_S3 	R12
+#define P_S3_n 	12
+
+#define STATE	R13 // pointer to the state argument
+#define MSG		R14 // read cursor into msg
+#define MSG_LEN	R15 // bytes of msg still to be processed
+
+#define D_0 	R16 // D_0-D_3: message words while loading, then the low product words
+#define D_1 	R17
+#define D_2 	R18
+#define D_3 	R19
+
+#define CA		R20 // running carry
+#define CA_n	20
+#define SC		R21 // constant 1, so that MADDU(CA_n, SC_n) adds CA to the accumulator
+#define SC_n	21
+
+#define TMP		R22 // scratch register
+
+#define MSB 3 // byte offset used with MOVWL for an unaligned load
+#define LSB 0 // byte offset used with MOVWR for an unaligned load
+
+// func update(state *macState, msg []byte, padbit uint32)
+TEXT ·update(SB), NOSPLIT|NOFRAME, $0-20
+	MOVW state+0(FP), STATE
+	MOVW msg_base+4(FP), MSG
+	MOVW msg_len+8(FP), MSG_LEN
+	// For every 16-byte block of msg: h = (h + block + padbit·2¹²⁸) · r, partially reduced mod 2¹³⁰ - 5.
+	/* load Rx */
+	MOVW	PTR_POLY1305_R(0), P_R0
+	MOVW	PTR_POLY1305_R(1), P_R1
+	MOVW	PTR_POLY1305_R(2), P_R2
+	MOVW	PTR_POLY1305_R(3), P_R3
+
+	/* load Hx */
+	MOVW	PTR_POLY1305_H(0), P_H0
+	MOVW	PTR_POLY1305_H(1), P_H1
+	MOVW	PTR_POLY1305_H(2), P_H2
+	MOVW	PTR_POLY1305_H(3), P_H3
+	MOVW	PTR_POLY1305_H(4), P_H4
+
+	/* Sx = Rx + (Rx >> 2) */
+	SRL		$2, P_R1, P_S1
+	SRL		$2, P_R2, P_S2
+	SRL		$2, P_R3, P_S3
+	ADDU	P_R1, P_S1
+	ADDU	P_R2, P_S2
+	ADDU	P_R3, P_S3
+
+	MOVW	$1, SC // SC = 1: MADDU(CA_n, SC_n) then simply adds CA to (HI,LO)
+
+	// The h loaded from the state is not necessarily fully reduced: h4 may use more than its
+	// two low bits (any h < 2(2¹³⁰ - 5) is a valid state, and some tests start from such a value).
+	//
+	// Fold the excess of h4 into the carry before the first block, exactly as it is done at the
+	// end of every loop iteration, so that the first ADDU_C chain adds it back into h0..h3.
+	/* c = (h4 >> 2) + (h4 & ~3U); */
+	SRL		$2, P_H4, CA
+	SLL		$2, CA, TMP
+	ADDU	TMP, CA
+	/* h4 &= 3 */
+	AND		$3,	P_H4
+
+loop:
+	MOVWL	 0+MSB(MSG), D_0 // MOVWL + MOVWR pairs: unaligned loads of the four message words
+	MOVWL	 4+MSB(MSG), D_1
+	MOVWL	 8+MSB(MSG), D_2
+	MOVWL	12+MSB(MSG), D_3
+	MOVWR	 0+LSB(MSG), D_0
+	MOVWR	 4+LSB(MSG), D_1
+	MOVWR	 8+LSB(MSG), D_2
+	MOVWR	12+LSB(MSG), D_3
+
+	/* h0 = (u32)(d0 = (u64)h0 + inp[0]); plus the carry c left over from the previous reduction */
+	ADDU_C	(CA, D_0, P_H0)
+
+	/* h1 = (u32)(d1 = (u64)h1 + (d0 >> 32) + inp[4]); */
+	ADDU_C	(CA, D_1, P_H1)
+
+	/* h2 = (u32)(d2 = (u64)h2 + (d1 >> 32) + inp[8]); */
+	ADDU_C	(CA, D_2, P_H2)
+
+	/* h3 = (u32)(d3 = (u64)h3 + (d2 >> 32) + inp[12]); */
+	ADDU_C	(CA, D_3, P_H3)
+
+	/* h4 += (u32)(d3 >> 32) + padbit; */
+	MOVW	padbit+16(FP), TMP
+	ADDU	TMP, P_H4
+	ADDU	CA, P_H4
+
+	/* D0 = h0*r0 + h1*s3 + h2*s2 + h3*s1 */
+	MULU	P_H0, P_R0
+	MADDU	(P_H1_n, P_S3_n)
+	MADDU	(P_H2_n, P_S2_n)
+	MADDU	(P_H3_n, P_S1_n)
+	MOVW	HI,	CA
+	MOVW	LO, D_0
+
+	/* D1 = h0*r1 + h1*r0 + h2*s3 + h3*s2 + h4*s1 + (D0 >> 32) */
+	MULU	P_H0, P_R1
+	MADDU	(P_H1_n, P_R0_n)
+	MADDU	(P_H2_n, P_S3_n)
+	MADDU	(P_H3_n, P_S2_n)
+	MADDU	(P_H4_n, P_S1_n)
+	MADDU	(CA_n, SC_n)
+	MOVW	HI, CA
+	MOVW	LO, D_1
+
+	/* D2 = h0*r2 + h1*r1 + h2*r0 + h3*s3 + h4*s2 + (D1 >> 32) */
+	MULU	P_H0, P_R2
+	MADDU	(P_H1_n, P_R1_n)
+	MADDU	(P_H2_n, P_R0_n)
+	MADDU	(P_H3_n, P_S3_n)
+	MADDU	(P_H4_n, P_S2_n)
+	MADDU	(CA_n, SC_n)
+	MOVW	HI, CA
+	MOVW	LO, D_2
+
+	/* D3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*s3 + (D2 >> 32) */
+	MULU	P_H0, P_R3
+	MADDU	(P_H1_n, P_R2_n)
+	MADDU	(P_H2_n, P_R1_n)
+	MADDU	(P_H3_n, P_R0_n)
+	MADDU	(P_H4_n, P_S3_n)
+	MADDU	(CA_n, SC_n)
+	MOVW	HI, CA
+	MOVW	LO, D_3
+
+	/* D4 = h4*r0 + (D3 >> 32); only the low word is kept */
+	MULU	P_H4, P_R0
+	MADDU	(CA_n, SC_n)
+	MOVW	LO, P_H4
+
+	MOVW	D_0, P_H0
+	MOVW	D_1, P_H1
+	MOVW	D_2, P_H2
+	MOVW	D_3, P_H3
+	/* P_H4 has been directly assigned in D4 step */
+
+	/* c = (h4 >> 2) + (h4 & ~3U); i.e. 5 * (h4 >> 2), the part of h above 2¹³⁰ folded back in */
+	SRL		$2, P_H4, CA
+	SLL		$2, CA, TMP
+	ADDU	TMP, CA
+
+	/* h4 &= 3 */
+	AND		$3,	P_H4
+
+	/* decrement length */
+	ADDU	$-16, MSG_LEN, MSG_LEN
+
+	/* increment pointers */
+	MOVW 	$16(MSG), MSG
+
+	/* another 16-byte block left? The length is only compared against zero, so it must be a non-zero multiple of 16. */
+	BNE     MSG_LEN, loop
+
+	/* h += c; */
+	ADDU_CA	(CA, P_H0)
+	ADDU_CA	(CA, P_H1)
+	ADDU_CA	(CA, P_H2)
+	ADDU_CA	(CA, P_H3)
+	ADDU_CA	(CA, P_H4)
+
+	/* store Hx; the sixth word (upper half of the third 64-bit slot of h) is cleared */
+	MOVW	P_H0, PTR_POLY1305_H(0)
+	MOVW	P_H1, PTR_POLY1305_H(1)
+	MOVW	P_H2, PTR_POLY1305_H(2)
+	MOVW	P_H3, PTR_POLY1305_H(3)
+	MOVW	P_H4, PTR_POLY1305_H(4)
+	MOVW	$0, PTR_POLY1305_H(5)
+
+	RET