From 2850721002cc36a0627ca690c68918f1fd6070cd Mon Sep 17 00:00:00 2001 From: Christian Zaefferer Date: Mon, 18 Feb 2019 11:43:19 +0100 Subject: [PATCH 1/2] fix: fixes #70. unescaped quotes inside of cells --- index.js | 16 +++++++-- test/data/unescaped_quotes.csv | 9 +++++ test/maxRowBytes.test.js | 4 +-- test/snapshots/test.js.md | 61 ++++++++++++++++++++++++++++++++- test/snapshots/test.js.snap | Bin 1702 -> 1821 bytes test/test.js | 18 ++++++++++ 6 files changed, 102 insertions(+), 6 deletions(-) create mode 100644 test/data/unescaped_quotes.csv diff --git a/index.js b/index.js index 367e6f0..b86802f 100644 --- a/index.js +++ b/index.js @@ -153,7 +153,7 @@ class CsvParser extends Transform { } for (let i = start; i < end; i++) { - const isStartingQuote = !isQuoted && buf[i] === this.quote + const isStartingQuote = !isQuoted && buf[i] === this.quote && (i === start || buf[i - 1] === comma) const isEndingQuote = isQuoted && buf[i] === this.quote && i + 1 <= end && buf[i + 1] === comma const isEscape = isQuoted && buf[i] === this.escape && i + 1 < end && buf[i + 1] === this.quote @@ -222,6 +222,7 @@ class CsvParser extends Transform { const bufLen = buf.length for (let i = start; i < bufLen; i++) { + const prevChr = i > 0 ? buf[i - 1] : null const chr = buf[i] const nextChr = i + 1 < bufLen ? buf[i + 1] : null @@ -237,10 +238,19 @@ class CsvParser extends Transform { if (this._escaped) { this._escaped = false // non-escaped quote (quoting the cell) + continue } else { - this._quoted = !this._quoted + // not in escape- or quote-mode, currently at start or previous char was separator or linebreak -> enter quote mode + if (!this._quoted && (prevChr === null || prevChr === this.separator || prevChr === nl || prevChr === this.newline)) { + this._quoted = true + continue + } + // in quote-mode but not escape-mode, next char is separator or linebreak -> leave quote mode + if (this._quoted && (nextChr === this.separator || nextChr === nl || nextChr === this.newline)) { + this._quoted = false + continue + } } - continue } if (!this._quoted) { diff --git a/test/data/unescaped_quotes.csv b/test/data/unescaped_quotes.csv new file mode 100644 index 0000000..15fe291 --- /dev/null +++ b/test/data/unescaped_quotes.csv @@ -0,0 +1,9 @@ +a,b,c +jo"e,sam,jan +"jo"e",sam,jan +joe,sa"m,jan +joe,"sa"m",jan +joe,sam,ja"n +joe,sam, "ja"n", +joe,"sa +"m", jan \ No newline at end of file diff --git a/test/maxRowBytes.test.js b/test/maxRowBytes.test.js index b5f3e72..b5a6435 100644 --- a/test/maxRowBytes.test.js +++ b/test/maxRowBytes.test.js @@ -5,9 +5,9 @@ const { collect } = require('./helpers/helper') test.cb('optional row size limit', (t) => { const verify = (err, lines) => { t.is(err.message, 'Row exceeds the maximum size', 'strict row size') - t.is(lines.length, 4576, '4576 rows before error') + t.is(lines.length, 13, '13 rows before error') t.end() } - collect('max_row_size.csv', { maxRowBytes: 200 }, verify) + collect('max_row_size.csv', { maxRowBytes: 170 }, verify) }) diff --git a/test/snapshots/test.js.md b/test/snapshots/test.js.md index 6494cf8..ee46d9c 100644 --- a/test/snapshots/test.js.md +++ b/test/snapshots/test.js.md @@ -504,4 +504,63 @@ Generated by [AVA](https://ava.li). > Snapshot 1 - [] \ No newline at end of file + [ + +## cell with unescaped quotes + +> first row + + Row { + a: 'jo"e', + b: 'sam', + c: 'jan', + } + +> second row + + Row { + a: 'jo"e', + b: 'sam', + c: 'jan', + } + +> third row + + Row { + a: 'joe', + b: 'sa"m', + c: 'jan', + } + +> fourth row + + Row { + a: 'joe', + b: 'sa"m', + c: 'jan', + } + +> fifth row + + Row { + a: 'joe', + b: 'sam', + c: 'ja"n', + } + +> sixth row + + Row { + a: 'joe', + b: 'sam', + c: ' "ja"n"', + } + +> seventh row + + Row { + a: 'joe', + b: `sa␊ + "m`, + c: ' jan', + } \ No newline at end of file diff --git a/test/snapshots/test.js.snap b/test/snapshots/test.js.snap index 82bfbf692939a135f4ac6f9c3627a50860550c15..5fe93b74450b5562d5e49d7f04243cc248d8cdcb 100644 GIT binary patch literal 1821 zcmV+&2jciaRzV|IgCNucq0QGK4pamitBNYS&6%_=1Ec!CEMWAfOZHSOT%Hlr@0|}VN&9=L z1nUKfVraO?8v9~S`j4|Vsq&_7?{us@M6hmAiCiZlX&MnnkjMnB$oOE15Ru|gk%Z17 zHzYEZ?(Uw8xfj${tcZ<7`%4C=smrT}8f= zNbQCYKRWwDde7H(e0A1Qy5^CGxt9`~^%ow|>={S>wJP``O8hPo&hCRDyL$ z50M@bp!tL+KmI81R$1SQwU&x)-)|&>b+1HnPmxo=p~8(TgA$J{s!ScIW+z)Wbn0ED0d~x`(Y<~OngIKUe_Ce^fX6fcjA#Z)X`dnV|stu6~uwb3n7omQu zO1CUdjjT@2|KjhVyWZGL1gpBAh*RPs@M_N}W?c7y>)u;2xn%Y9t5dLGo%A?DrmiCo zy?v(QLS$WSOp0?)QUHI<;+MDe>& zn^#|AVHWIw>K>p5_!%`X95lZJO*Eh{1td1onfPl4Be^Mql)Chp`R;i*RDK=~d#Gwh zR@LcJl~h7?J8)Q0QBns@t^hh^HC3(2vbq9>q6-+BDqxg4@ewJvNs(Hc0!bm1Hei{8 zDouDvX0cA$nEEIo>%Ct1y$gC0cDMyu6sGINZB<-tBZF4hneJBW&n ziO-qmxy%fQ@+lw%$Y{as5#jc7T=8Ej#UHAl)K^tPua}|vI}if#9iWi!p?MNCNdfG@ zGKDDt&9_05oxqpCg+|TQ?XylRbgNI}j4V}RvvQP)KE3lIH!{xgjA>ZxD z4jXO0mC2e4u9tk9{+og`hStOfuR9P{h^Thbwcjf-SgzH=Ad){ zD;Jv!UCoQ|ZG6Y7V=BBK?Gx2iHyb>UZ zx(O1|EN^6Ntch;5SbTuQf<)c2LoYvu&gXe@hb}?hnc?>)bkS1iqB~ybqN5*%?qTTK zFLbT_8PRiX_vgLMk)H|fV_=`%L|K!&iTm_DIo2I$?@MuR`Mob0+<{i=)<@Ymd1Vbv zD(edK3jo*ZDNN+174hBJ8l?-Qsgv6@Ah!iUh8{xddqtiS3l@w$r4>ycgEuwkb3o2r z4@ywu5d{rA$Afzh;Nhh5FMXu`_oy&n?A;$u2H9qsAVD{L?DI^CC70ou5<6vNI4YN= zc$VcTj9x_4(T zH44YU1i5J+LjeoU1+7{kEaOF6v-~Lq>$n67oVoK z)hANu^RFRpm()B%KR7>alUcp_utSn1%u zaYjqBQe;fbw5&0;v78$JB$3+SO#hsmky5P5{;T7^e!N}oj)QYgyaOt3<}>fzZ-4vV z?|pyo-g~<@h(ahFq53Nk2WkWLjb#mC3ukT2#;E<13rzlK#h%)wtFz-qzww?ab^izj zv9?N-gX1!D{IdmF-46eKBIUs%$%93% zN#qPc=+PVW2?91jcuBDCbun^BG%%eBHu`4 zgdrqM&TGvY{?g9R&)BLqToB%-i1oBYQMkw-z?`#n>o$~5`+DlfC(l2W-fU11>xy9_ z!y-WQ84rK(e$kCZqiSC^)o%ZGGexX>CDMkAoB|Fl*t|Ai(vf8i8Do^(jEmbSVlBT{ zjq#ldSJ;jmk%YChaSrlcH|$##2OWe&_&hC`U^p?eX;&*QTe(}QA;qf){H`E^t!68 z%QK>y(uzO*GjaDT^%SuxM~m1c&I2zFk7lQZkJ=k{S@Ee2AL>b@M2))1wtyAnOgXv)RJdXhmXpru)Vu+4hCX!EX?$g%UX zFtPp(tli&O-xSe$gHu$tJeGO>5+>G>Q3xrJn#_5u#)>#7#{pqj9g3-Ja}~y+I3Tld z0jIZVwOYlkB?t#2j1j&Ed=%lBLRcA!RVAFsWOjV3GFwdsmu8j8?LQ~!XJwUdo^op* zvyNJV`=vrhP3L||@2DmECzx@bgGurnOtP!sW1&py2tt)=wX*j1FvMC&NC=4{kSMOf zcuL280hLfL2Hpft0KWlR>^c}lcY{&6g{5W<1}i5B9p8fzbn!IH+iAfT=jN)^8*O%n zUn+06nXC9%IycvSE)*H9g3WQ)baM&s-ja!RI#6k5&h0w2$Y^o27C$~g=bAn$5y}a` z6JDsMAgaw2?}b`Tdx^;`*a_7=Kr`?oy1j7VehXYwpf3$f>ZCK}s~L2Cy=-N13+hkls;P-prF_w)Ss!P5VebPNpk)6yRdOTUJ! z{hB^s{FOYEHeijUkIV+(*C&eq#31}}0=yu90;FKF1_IH*6#tw6?`p~E*)yKvc*FnT z@z;mJ6a!1|CKz9x4E&>%5B3usl;eTf@`2p#!QS`XdVZ8rFpTfJ^K$d?yQZCX%-&Qr z0J?=v`%KyXxya-Lm3I@2D({h}IQO?%|MG>UQ!v$jN|i=1mvJ;aS66gdmYq@Q{3Frr zrmW{+k~{_Tf}bP`BN8#3pl23^w4& zl~y#lblzR3PXYyZ+$n`R_eeBw?GMK8$Cs1x|MXGq|3`%YQ}6ucWPoM90T$?vuaS>e zmO!kx5``2K4W3{nfYA);q6%03(TQR3s(<0P9>)wg3PC diff --git a/test/test.js b/test/test.js index 563ce0c..974a2ec 100644 --- a/test/test.js +++ b/test/test.js @@ -145,6 +145,24 @@ test.cb('cell with newline', (t) => { }) }) +test.cb('cell with unescaped quotes', (t) => { + const verify = (err, lines) => { + // console.log(lines); + t.false(err, 'no err') + t.snapshot(lines[0], 'first row') + t.snapshot(lines[1], 'second row') + t.snapshot(lines[2], 'third row') + t.snapshot(lines[3], 'fourth row') + t.snapshot(lines[4], 'fifth row') + t.snapshot(lines[5], 'sixth row') + t.snapshot(lines[6], 'seventh row') + t.is(lines.length, 7, '7 rows') + t.end() + } + + collect('unescaped_quotes.csv', verify) +}) + test.cb('cell with escaped quote in quotes', (t) => { const headers = bops.from('a\n') const cell = bops.from('"ha ""ha"" ha"\n') From 4d400c9e05e40ceafa710fd9d5eeca64b8906a00 Mon Sep 17 00:00:00 2001 From: Christian Zaefferer Date: Mon, 18 Feb 2019 12:51:36 +0100 Subject: [PATCH 2/2] fix: handle crlf --- index.js | 3 ++- test/data/unescaped_quotes.csv | 6 ++++-- test/snapshots/test.js.md | 20 ++++++++++++++++++-- test/snapshots/test.js.snap | Bin 1821 -> 1822 bytes test/test.js | 4 +++- 5 files changed, 27 insertions(+), 6 deletions(-) diff --git a/index.js b/index.js index b86802f..12433ff 100644 --- a/index.js +++ b/index.js @@ -225,6 +225,7 @@ class CsvParser extends Transform { const prevChr = i > 0 ? buf[i - 1] : null const chr = buf[i] const nextChr = i + 1 < bufLen ? buf[i + 1] : null + const nextNextChr = i + 2 < bufLen ? buf[i + 2] : null this._currentRowBytes++ if (this._currentRowBytes > this.maxRowBytes) { @@ -246,7 +247,7 @@ class CsvParser extends Transform { continue } // in quote-mode but not escape-mode, next char is separator or linebreak -> leave quote mode - if (this._quoted && (nextChr === this.separator || nextChr === nl || nextChr === this.newline)) { + if (this._quoted && (nextChr === this.separator || (this.customNewline ? nextChr === this.newline : nextChr === nl || (nextChr === cr && nextNextChr === nl)))) { this._quoted = false continue } diff --git a/test/data/unescaped_quotes.csv b/test/data/unescaped_quotes.csv index 15fe291..5c29715 100644 --- a/test/data/unescaped_quotes.csv +++ b/test/data/unescaped_quotes.csv @@ -4,6 +4,8 @@ jo"e,sam,jan joe,sa"m,jan joe,"sa"m",jan joe,sam,ja"n -joe,sam, "ja"n", +joe,sam,"ja"n" joe,"sa -"m", jan \ No newline at end of file +"m",jan +joe,crlf,"jan" +joe,sam,"ja"n" diff --git a/test/snapshots/test.js.md b/test/snapshots/test.js.md index ee46d9c..e029add 100644 --- a/test/snapshots/test.js.md +++ b/test/snapshots/test.js.md @@ -553,7 +553,7 @@ Generated by [AVA](https://ava.li). Row { a: 'joe', b: 'sam', - c: ' "ja"n"', + c: 'ja"n', } > seventh row @@ -562,5 +562,21 @@ Generated by [AVA](https://ava.li). a: 'joe', b: `sa␊ "m`, - c: ' jan', + c: 'jan', + } + +> eighth row + + Row { + a: 'joe', + b: 'crlf', + c: 'jan', + } + +> ninth row + + Row { + a: 'joe', + b: 'sam', + c: 'ja"n', } \ No newline at end of file diff --git a/test/snapshots/test.js.snap b/test/snapshots/test.js.snap index 5fe93b74450b5562d5e49d7f04243cc248d8cdcb..bd6addcd46c4b53c20e4368f344af24f7467442c 100644 GIT binary patch literal 1822 zcmV+(2jTcZRzVV6uo}CVIP|=tu~Q-~-E{a6l~)G(eH;vU`EWxN#_g;3Fh2^a~4 zf8Ral+_Pj8ga#s1aXsiznUAijpfa#z=B8AP8vfFONuRFVSGH_TYQ(U2J~YN39IOzm zbrOZpaJeb+xvZ2Q7i?AJ%-PxJ#A08;x=kT+gNP)nM66yS)4fHe`AGPR6#9w8_7%A) zk=75PqvyA$otj*uIaD)LRb6|m9~P_^fe$vRl4_UT+^P;MDm#$0cMcJ(W`B{s8j*4! z;q2x&_hnT3&N%Zq7yNu$6%nkO0Ff6Zegl-(LYI4=ShF=_O!d6;>Ggg@u;%s``A#A! z5Fvh2MqSFFS9X1M-deozGXFjitY;;128jFx%)U^*e&d3v-%k1L^rexBHF||$T{%!> zU=V0NebmPv<=kF0wCr_b*^ckG5W%`%B4Loo8Q@6%mUUin$Cg(njaIVLuWToRweVq) zyArDgBlPQ%8;Cxg5|b6Q?)ZxR|HKl(8vcmLA&JBx2pR8e?o)5`{vouy#3xPj=N&9q zHv*H;@TGIF9f>InIFZ5c%sh+*Ye+CcS5&JiF8jXq^@a;Mh3hwmEX9JgbSOf@))#ME zkrYy$ko(2o(R<#gAc9pnOvEN}33z!>C^L1y=uPjfomI3U^V)1ISZ6+pkiP$fBX6H8 zI~By%Y#Vp$;sGpJbAkBd+HVFYREDU^uZPANjOkdgo~9As1B;T%_A3X5hAZij0zJuERix2AqDalo+j*#6%kO51p={m04AxXVvGZ@S8{eftFvk}8pYkE z2>T(75q=2TCrwMWdBrbvKaMF-6~@n9#rXpwa9l*T)M}My%yzqDcfGt zx?a-RYtf!@W~_T>V%$3u>n!*fC}SF(P{ms;jIF^9vF7jZFL>c2D6YnMa^v^B7DBlM zco#SY`~hgN^UWw)x*3&=Ti%RbZ((`9@w0D~Hj-jE8^v2AT$(C%2CL29E}pYlO~qU| zm7eaJ3poZ0Z?%7GnkkQS?a9bE8ju7N>vEkeGMMeXMUIcsI){%$LpcF>stu|s1l4AU z--TLfLy3hf*ag*nKn?ISYF#*JeixdkKwk=oYoashtPDo(QU=L&*(~!Nc^K_J563=G zwI{1;v#Uxfp}G?|Dyt}|hbC76wY-{&_RO;CJcgq47@Epsk*4-tIf>P6DfM8$-r6ZaEjKMOcuR?W2`I>_ROeAZA4@^ z6&VqeRqDLVjDzxVAQ4Dy!|f5__Ok5pUnRvKs-HAe6+*99p!z%D3-KKxGv7n=cxaLc zSb)_sDFMxQK$G3Tmq1;U=E|;Fr)9d;By&2JDl(fmO3hGKLxJnk(Z&^6&iirhyB+Vo z+X+23{5-HmKKxbCWDjr>_^AhmKfn?GzCAqt-7xk4B5ecxJT&$D!PKu7=6-dzPyR{{ zN-MBd(nn?i@aP-GeQ^_>*a2Ra-vJUxRv#b~nCzJyV27rh_PygNPFDU8UVl5$nOtC5 zN1W-boq=bR>V)+~3*~rVmb@b!-Pm{juI1EUjpA8oZ@J zp9Zq-c~F9yjwop0JRaP;2M;IZf9WF)zeo84Q||q6(#t$w4-<6b$KWT6jM)s&6j>-8 z!%^8R#j`9&ag5n);BsBQt2Ub!umvV=zCrJtsq&)?oSny7Sjxat43%A^r%W7W%`@+ju~w)fh~F+I%%iaWBxD)UXW`{OCXGX-v!S(49i zy~U?F9rcOiH~ZHRhf7MHq3<7`I&>`2s!y$kPMOcpy~?K&kNKqH8FkM~D4oTa)9VuI MKV`f;owF+d0DgLb3IG5A literal 1821 zcmV+&2jciaRzV|IgCNucq0QGK4pamitBNYS&6%_=1Ec!CEMWAfOZHSOT%Hlr@0|}VN&9=L z1nUKfVraO?8v9~S`j4|Vsq&_7?{us@M6hmAiCiZlX&MnnkjMnB$oOE15Ru|gk%Z17 zHzYEZ?(Uw8xfj${tcZ<7`%4C=smrT}8f= zNbQCYKRWwDde7H(e0A1Qy5^CGxt9`~^%ow|>={S>wJP``O8hPo&hCRDyL$ z50M@bp!tL+KmI81R$1SQwU&x)-)|&>b+1HnPmxo=p~8(TgA$J{s!ScIW+z)Wbn0ED0d~x`(Y<~OngIKUe_Ce^fX6fcjA#Z)X`dnV|stu6~uwb3n7omQu zO1CUdjjT@2|KjhVyWZGL1gpBAh*RPs@M_N}W?c7y>)u;2xn%Y9t5dLGo%A?DrmiCo zy?v(QLS$WSOp0?)QUHI<;+MDe>& zn^#|AVHWIw>K>p5_!%`X95lZJO*Eh{1td1onfPl4Be^Mql)Chp`R;i*RDK=~d#Gwh zR@LcJl~h7?J8)Q0QBns@t^hh^HC3(2vbq9>q6-+BDqxg4@ewJvNs(Hc0!bm1Hei{8 zDouDvX0cA$nEEIo>%Ct1y$gC0cDMyu6sGINZB<-tBZF4hneJBW&n ziO-qmxy%fQ@+lw%$Y{as5#jc7T=8Ej#UHAl)K^tPua}|vI}if#9iWi!p?MNCNdfG@ zGKDDt&9_05oxqpCg+|TQ?XylRbgNI}j4V}RvvQP)KE3lIH!{xgjA>ZxD z4jXO0mC2e4u9tk9{+og`hStOfuR9P{h^Thbwcjf-SgzH=Ad){ zD;Jv!UCoQ|ZG6Y7V=BBK?Gx2iHyb>UZ zx(O1|EN^6Ntch;5SbTuQf<)c2LoYvu&gXe@hb}?hnc?>)bkS1iqB~ybqN5*%?qTTK zFLbT_8PRiX_vgLMk)H|fV_=`%L|K!&iTm_DIo2I$?@MuR`Mob0+<{i=)<@Ymd1Vbv zD(edK3jo*ZDNN+174hBJ8l?-Qsgv6@Ah!iUh8{xddqtiS3l@w$r4>ycgEuwkb3o2r z4@ywu5d{rA$Afzh;Nhh5FMXu`_oy&n?A;$u2H9qsAVD{L?DI^CC70ou5<6vNI4YN= zc$VcTj9x_4(T zH44YU1i5J+LjeoU1+7{kEaOF6v-~Lq>$n67oVoK z)hANu^RFRpm()B%KR7 { t.snapshot(lines[4], 'fifth row') t.snapshot(lines[5], 'sixth row') t.snapshot(lines[6], 'seventh row') - t.is(lines.length, 7, '7 rows') + t.snapshot(lines[7], 'eighth row') + t.snapshot(lines[8], 'ninth row') + t.is(lines.length, 9, '9 rows') t.end() }