前端實現 CSV 文件解析的方法詳解
基本規則

簡單來說就是:使用逗號分隔數據,當出現衝突時,使用雙引號包裹衝突數據來解決衝突(沒有衝突也可以使用雙引號包裹數據)。通過逗號將數據分隔成列,通過 \n 換行符將數據分隔成行,因此 CSV 格式可以用來表示二維表格數據。
CSV 解析
根據上面的格式,簡單寫一個 CSV 解析器。思路其實很簡單,就是先按 \n 進行分行,再按 , 進行分列。只不過需要注意的是:在分列的時候遇到 " 要將雙引號內的內容作為一個整體。具體代碼如下:
// Special characters that give CSV text its structure.
const SIGN = {
  rowDelimiter: '\n',
  colDelimiter: ',',
  specialCharacter: '"',
}

/**
 * Parses CSV text into a two-dimensional array of strings.
 *
 * The text is scanned once, character by character:
 * - a field whose first character is a double quote is a quoted field; inside
 *   it, column and row delimiters are plain data and two consecutive double
 *   quotes stand for one literal double quote;
 * - outside quoted fields, `,` ends a field and `\n` (or `\r\n`) ends a row;
 * - a double quote that is not the first character of a field is kept as-is.
 *
 * An empty line yields an empty row (`[]`), so `parseCsv('')` returns `[[]]`
 * and input ending in a newline yields a final `[]`.
 *
 * @param {string} [str=''] - CSV text to parse.
 * @returns {string[][]} rows of fields, in input order.
 */
const parseCsv = (str = '') => {
  const rows = []
  let chunk = []
  let unit = ''
  let doubleQuoteIsClose = true
  // True once the current field has received a character or an opening quote;
  // distinguishes a quoted empty field ("") from a completely empty line.
  let fieldStarted = false

  const endField = () => {
    chunk.push(unit)
    unit = ''
    fieldStarted = false
  }

  const endRow = () => {
    // Flush the pending field, including an empty one after a trailing
    // delimiter ("a,b," has three fields). A blank line stays an empty row.
    if (chunk.length > 0 || fieldStarted) {
      endField()
    }
    rows.push(chunk)
    chunk = []
  }

  for (let i = 0; i < str.length; i++) {
    const s = str[i]

    if (!doubleQuoteIsClose) {
      if (s !== SIGN.specialCharacter) {
        // Inside quotes every character, delimiters included, is data.
        unit += s
      } else if (str[i + 1] === SIGN.specialCharacter) {
        // Doubled quote: emit one literal quote and skip the second one.
        unit += s
        i++
      } else {
        doubleQuoteIsClose = true
      }
      continue
    }

    if (s === SIGN.specialCharacter && !fieldStarted) {
      // Opening quote of a quoted field; the quote itself is not data.
      doubleQuoteIsClose = false
      fieldStarted = true
      continue
    }
    if (s === SIGN.colDelimiter) {
      endField()
      continue
    }
    if (s === SIGN.rowDelimiter) {
      endRow()
      continue
    }
    if (s === '\r' && str[i + 1] === SIGN.rowDelimiter) {
      // Drop the CR of a CRLF line ending so it does not leak into the field.
      continue
    }
    unit += s
    fieldStarted = true
  }

  // Collect the final row (input is not required to end with a newline).
  endRow()
  return rows
}
通過上面的處理,我們可以將一個 csv 格式的數據解析成一個二維數組。其實 csv 數據組織格式的核心很簡單:使用 \n 分隔行,使用 , 分隔列,使用 " 作為特殊字符來解決字符衝突。至於是否組裝 header,以及一些優化處理就比較簡單了。
PapaParse 源碼分析
接下來我們來看一款成熟的工具:PapaParse。PapaParse 有豐富的使用文檔,並且在 GitHub 上有 12k 的 star,npm 的周下載量也高達兩百多萬,是一款比較推薦的 csv 解析庫。
解析相關的核心源碼我放在了附錄。我把它的解析邏輯分成兩種情況,一種是不包含雙引號的情況,代碼如下:
// Excerpt from the body of the parser loop: handling of a field that does not
// start with a quote. Relies on variables and helpers (cursor, nextDelim,
// nextNewline, row, saveRow, doStep, returnable, ...) from the full listing.
// Next delimiter comes before next newline, so we've reached end of field
if (
nextDelim !== -1 &&
(nextDelim < nextNewline || nextNewline === -1)
) {
// The field is the text between the cursor and the delimiter
row.push(input.substring(cursor, nextDelim))
cursor = nextDelim + delimLen
// we look for next delimiter char
nextDelim = input.indexOf(delim, cursor)
continue
}
// End of row
if (nextNewline !== -1) {
// Last field of the row is the text between the cursor and the newline
row.push(input.substring(cursor, nextNewline))
saveRow(nextNewline + newlineLen)
if (stepIsFunction) {
doStep()
if (aborted) return returnable()
}
// Stop early once the number of collected rows reaches the preview limit
if (preview && data.length >= preview) return returnable(true)
continue
}
其實也就是簡單的按分隔符切割字符串。
另外一種是包含雙引號的情況。值得一提的是:如果包含雙引號,那麼開頭和結尾一定是雙引號,因此關於雙引號中間部分的解析,主要需要注意的是內部可能包含轉義字符。比如:
// Excerpt from the inner quote-search loop: decides whether the quote found at
// index quoteSearch is escaped (and therefore data) instead of a closing quote.
// If this quote is escaped, it's part of the data; skip it
// If the quote character is the escape character, then check if the next character is the escape character
if (quoteChar === escapeChar && input[quoteSearch + 1] === escapeChar) {
// Step over the second character of the pair before searching again
quoteSearch++
continue
}
// If the quote character is not the escape character, then check if the previous character was the escape character
if (
quoteChar !== escapeChar &&
quoteSearch !== 0 &&
input[quoteSearch - 1] === escapeChar
) {
continue
}
它的這個邏輯對於轉義字符的判斷更加細膩,不過我們之前用 doubleQuoteIsClose 取反來判斷分隔符是否位於雙引號內的做法,應該也沒有太大的問題。
// Flip the open/closed flag every time a double quote character is seen
if (s === SIGN.specialCharacter) {
doubleQuoteIsClose = !doubleQuoteIsClose
}
所以 csv 數據格式解析的核心邏輯其實是很簡單的。
附錄(解析相關的源碼)
/**
 * Parses `input` into rows of fields and returns them together with any
 * errors and meta information.
 *
 * The delimiter, newline, quote, escape and comment settings (delim, newline,
 * quoteChar, escapeChar, comments), as well as config, step, preview,
 * fastMode, cursor, aborted and renamedHeaders, come from the enclosing scope
 * and are not defined in this listing.
 *
 * @param {string} input - Text to parse.
 * @param {number} [baseIndex] - Added to the cursor reported in the result
 *   meta; header de-duplication only runs when this is falsy.
 * @param {boolean} [ignoreLastRow] - When truthy, the last row is not emitted
 *   and an unterminated quoted field is not reported as an error.
 * @returns {{data: Array, errors: Array, meta: Object}} Object built by returnable().
 * @throws {Error} If input is not a string.
 */
this.parse = function (input, baseIndex, ignoreLastRow) {
// For some reason, in Chrome, this speeds things up (!?)
if (typeof input !== 'string') throw new Error('Input must be a string')
// We don't need to compute some of these every time parse() is called,
// but having them in a more local scope seems to perform better
var inputLen = input.length,
delimLen = delim.length,
newlineLen = newline.length,
commentsLen = comments.length
var stepIsFunction = isFunction(step)
// Establish starting state
cursor = 0
var data = [],
errors = [],
row = [],
lastCursor = 0
if (!input) return returnable()
// Rename headers if there are duplicates
var firstLine
if (config.header && !baseIndex) {
firstLine = input.split(newline)[0]
var headers = firstLine.split(delim)
var separator = '_'
// Final (unique) header names, in insertion order
var headerMap = new Set()
// Number of times each header text has been seen so far
var headerCount = {}
var duplicateHeaders = false
// Using old-style 'for' loop to avoid prototype pollution that would be picked up with 'var j in headers'
for (var j = 0; j < headers.length; j++) {
var header = headers[j]
if (isFunction(config.transformHeader))
header = config.transformHeader(header, j)
var headerName = header
var count = headerCount[header] || 0
// A repeated header gets the suffix separator + count
if (count > 0) {
duplicateHeaders = true
headerName = header + separator + count
// Initialise the variable if it hasn't been.
if (renamedHeaders === null) {
renamedHeaders = {}
}
}
headerCount[header] = count + 1
// In case it already exists, we add more separators
while (headerMap.has(headerName)) {
headerName = headerName + separator + count
}
headerMap.add(headerName)
// Remember which original header each renamed header came from
if (count > 0) {
renamedHeaders[headerName] = header
}
}
// Rewrite the first line of the input with the de-duplicated header names
if (duplicateHeaders) {
var editedInput = input.split(newline)
editedInput[0] = Array.from(headerMap).join(delim)
input = editedInput.join(newline)
}
}
// Fast path: taken when fastMode is truthy, or when fastMode is not
// explicitly false and the input contains no quote character. Rows and
// fields are then obtained with plain split() calls.
if (fastMode || (fastMode !== false && input.indexOf(quoteChar) === -1)) {
var rows = input.split(newline)
for (var i = 0; i < rows.length; i++) {
row = rows[i]
// use firstline as row length may be changed due to duplicated headers
if (i === 0 && firstLine !== undefined) {
cursor += firstLine.length
} else {
cursor += row.length
}
// Every row except the last is followed by a newline
if (i !== rows.length - 1) cursor += newline.length
else if (ignoreLastRow) return returnable()
// Skip rows that start with the comment marker
if (comments && row.substring(0, commentsLen) === comments) continue
if (stepIsFunction) {
data = []
pushRow(row.split(delim))
doStep()
if (aborted) return returnable()
} else pushRow(row.split(delim))
// Preview limit reached: keep at most `preview` rows and report truncation
if (preview && i >= preview) {
data = data.slice(0, preview)
return returnable(true)
}
}
return returnable()
}
// Slow path: positions of the next delimiter, newline and quote from the cursor
var nextDelim = input.indexOf(delim, cursor)
var nextNewline = input.indexOf(newline, cursor)
// Matches an escape character followed by a quote character; used to
// replace escaped quotes in field values with a single quote character
var quoteCharRegex = new RegExp(
escapeRegExp(escapeChar) + escapeRegExp(quoteChar),
'g'
)
var quoteSearch = input.indexOf(quoteChar, cursor)
// Parser loop
for (;;) {
// Field has opening quote
if (input[cursor] === quoteChar) {
// Start our search for the closing quote where the cursor is
quoteSearch = cursor
// Skip the opening quote
cursor++
for (;;) {
// Find closing quote
quoteSearch = input.indexOf(quoteChar, quoteSearch + 1)
//No other quotes are found - no other delimiters
if (quoteSearch === -1) {
if (!ignoreLastRow) {
// No closing quote... what a pity
errors.push({
type: 'Quotes',
code: 'MissingQuotes',
message: 'Quoted field unterminated',
row: data.length, // row has yet to be inserted
index: cursor,
})
}
return finish()
}
// Closing quote at EOF
if (quoteSearch === inputLen - 1) {
var value = input
.substring(cursor, quoteSearch)
.replace(quoteCharRegex, quoteChar)
return finish(value)
}
// If this quote is escaped, it's part of the data; skip it
// If the quote character is the escape character, then check if the next character is the escape character
// Two consecutive double quotes denote an escaped quote
if (quoteChar === escapeChar && input[quoteSearch + 1] === escapeChar) {
quoteSearch++
continue
}
// If the quote character is not the escape character, then check if the previous character was the escape character
if (
quoteChar !== escapeChar &&
quoteSearch !== 0 &&
input[quoteSearch - 1] === escapeChar
) {
continue
}
// The quote at quoteSearch is now treated as a candidate closing quote;
// refresh nextDelim / nextNewline if they point inside the quoted text
if (nextDelim !== -1 && nextDelim < quoteSearch + 1) {
nextDelim = input.indexOf(delim, quoteSearch + 1)
}
if (nextNewline !== -1 && nextNewline < quoteSearch + 1) {
nextNewline = input.indexOf(newline, quoteSearch + 1)
}
// Check up to nextDelim or nextNewline, whichever is closest
var checkUpTo =
nextNewline === -1 ? nextDelim : Math.min(nextDelim, nextNewline)
var spacesBetweenQuoteAndDelimiter = extraSpaces(checkUpTo)
// Closing quote followed by delimiter or 'unnecessary spaces + delimiter'
// Skip over the spaces
if (
input.substr(
quoteSearch + 1 + spacesBetweenQuoteAndDelimiter,
delimLen
) === delim
) {
row.push(
input
.substring(cursor, quoteSearch)
.replace(quoteCharRegex, quoteChar)
)
cursor = quoteSearch + 1 + spacesBetweenQuoteAndDelimiter + delimLen
// If char after following delimiter is not quoteChar, we find next quote char position
if (
input[
quoteSearch + 1 + spacesBetweenQuoteAndDelimiter + delimLen
] !== quoteChar
) {
quoteSearch = input.indexOf(quoteChar, cursor)
}
nextDelim = input.indexOf(delim, cursor)
nextNewline = input.indexOf(newline, cursor)
break
}
var spacesBetweenQuoteAndNewLine = extraSpaces(nextNewline)
// Closing quote followed by newline or 'unnecessary spaces + newLine'
if (
input.substring(
quoteSearch + 1 + spacesBetweenQuoteAndNewLine,
quoteSearch + 1 + spacesBetweenQuoteAndNewLine + newlineLen
) === newline
) {
row.push(
input
.substring(cursor, quoteSearch)
.replace(quoteCharRegex, quoteChar)
)
saveRow(quoteSearch + 1 + spacesBetweenQuoteAndNewLine + newlineLen)
nextDelim = input.indexOf(delim, cursor) // because we may have skipped the nextDelim in the quoted field
quoteSearch = input.indexOf(quoteChar, cursor) // we search for first quote in next line
if (stepIsFunction) {
doStep()
if (aborted) return returnable()
}
if (preview && data.length >= preview) return returnable(true)
break
}
// Checks for valid closing quotes are complete (escaped quotes or quote followed by EOF/delimiter/newline) -- assume these quotes are part of an invalid text string
errors.push({
type: 'Quotes',
code: 'InvalidQuotes',
message: 'Trailing quote on quoted field is malformed',
row: data.length, // row has yet to be inserted
index: cursor,
})
quoteSearch++
continue
}
continue
}
// Comment found at start of new line
if (
comments &&
row.length === 0 &&
input.substring(cursor, cursor + commentsLen) === comments
) {
if (nextNewline === -1)
// Comment ends at EOF
return returnable()
// Skip the whole comment line
cursor = nextNewline + newlineLen
nextNewline = input.indexOf(newline, cursor)
nextDelim = input.indexOf(delim, cursor)
continue
}
// Next delimiter comes before next newline, so we've reached end of field
if (nextDelim !== -1 && (nextDelim < nextNewline || nextNewline === -1)) {
row.push(input.substring(cursor, nextDelim))
cursor = nextDelim + delimLen
// we look for next delimiter char
nextDelim = input.indexOf(delim, cursor)
continue
}
// End of row
if (nextNewline !== -1) {
row.push(input.substring(cursor, nextNewline))
saveRow(nextNewline + newlineLen)
if (stepIsFunction) {
doStep()
if (aborted) return returnable()
}
if (preview && data.length >= preview) return returnable(true)
continue
}
// Neither a delimiter nor a newline remains: leave the loop and let
// finish() take the rest of the input as the last field
break
}
return finish()
/** Appends a completed row to data and records the current cursor as lastCursor. */
function pushRow(row) {
data.push(row)
lastCursor = cursor
}
/**
* checks if there are extra spaces after closing quote and given index without any text
* if Yes, returns the number of spaces
*/
function extraSpaces(index) {
var spaceLength = 0
if (index !== -1) {
var textBetweenClosingQuoteAndIndex = input.substring(
quoteSearch + 1,
index
)
// Only count the text when it is non-empty and trims to an empty string
if (
textBetweenClosingQuoteAndIndex &&
textBetweenClosingQuoteAndIndex.trim() === ''
) {
spaceLength = textBetweenClosingQuoteAndIndex.length
}
}
return spaceLength
}
/**
* Appends the remaining input from cursor to the end into
* row, saves the row, calls step, and returns the results.
* When `value` is given it is used as the last field instead of the
* remaining input. Does nothing but return when ignoreLastRow is truthy.
*/
function finish(value) {
if (ignoreLastRow) return returnable()
if (typeof value === 'undefined') value = input.substring(cursor)
row.push(value)
cursor = inputLen // important in case parsing is paused
pushRow(row)
if (stepIsFunction) doStep()
return returnable()
}
/**
* Appends the current row to the results. It sets the cursor
* to newCursor and finds the nextNewline. The caller should
* take care to execute user's step function and check for
* preview and end parsing if necessary.
*/
function saveRow(newCursor) {
cursor = newCursor
pushRow(row)
row = []
nextNewline = input.indexOf(newline, cursor)
}
/** Returns an object with the results, errors, and meta. */
function returnable(stopped) {
return {
data: data,
errors: errors,
meta: {
delimiter: delim,
linebreak: newline,
aborted: aborted,
// true only when the caller passed a truthy `stopped`
truncated: !!stopped,
cursor: lastCursor + (baseIndex || 0),
renamedHeaders: renamedHeaders,
},
}
}
/** Executes the user's step function and resets data & errors. */
function doStep() {
step(returnable())
data = []
errors = []
}
}
到此這篇關於前端實現 CSV 文件解析的方法詳解的文章就介紹到這了,更多相關 CSV 文件解析內容請搜索腳本之家以前的文章或繼續瀏覽下面的相關文章,希望大家以後多多支持腳本之家!
相關(guān)文章
JavaScript變量類型以及變量之間的轉換你了解嗎
這篇文章主要為大家詳細介紹了JavaScript變量類型以及變量之間的轉換,文中示例代碼介紹的非常詳細,具有一定的參考價值,感興趣的小伙伴們可以參考一下,希望能夠給你帶來幫助2022-02-02
JavaScript高級程序設計 事件學習筆記
JavaScript高級程序設計 事件學習筆記,需要的朋友可以參考下。2011-09-09
JavaScript拆分字符串時產生空字符的解決方案
使用JavaScript的split方法拆分字符串時出現一些空字符串"",尤其是當使用正則表達式作為分隔符的時候。那麼,產生這些空字符串的原因是什麼?又該如何來處理呢,這就是今天我們要探討的問題2014-09-09
JavaScript極簡入門教程(二):對象和函數
這篇文章主要介紹了JavaScript極簡入門教程(二):對象和函數,本文講解了對象基礎知識、函數基礎知識、函數調用、異常、繼承等內容,需要的朋友可以參考下2014-10-10

