瀏覽代碼

Initial version of csv parser without quotes.

It is *way* faster.
Jason Grout 7 年之前
父節點
當前提交
204847f8f1
共有 2 個文件被更改,包括 93 次插入18 次删除
  1. 24 6
      packages/csvviewer/src/model.ts
  2. 69 12
      packages/csvviewer/src/parse.ts

+ 24 - 6
packages/csvviewer/src/model.ts

@@ -6,7 +6,7 @@ import {
 } from '@phosphor/datagrid';
 
 import {
-  parseDSV// , STATE
+  parseDSV, parseDSVNoQuotes
 } from './parse';
 
 /*
@@ -48,6 +48,10 @@ class DSVModel extends DataModel {
 
     console.log(`Parsed initial ${this._rowCount} rows, ${this._rowCount * this._columnCount} values, in ${(end - start) / 1000}s`);
 
+    start = performance.now();
+    let i = data.indexOf('`');
+    end = performance.now();
+    console.log(`indexOf ${data.length} chars, found at ${i}, in ${(end - start) / 1000}s`);
   }
 
   /**
@@ -116,11 +120,14 @@ class DSVModel extends DataModel {
   }
 
   private _computeOffsets(maxRows = 4294967295) {
-    let {nrows, ncols, offsets} = parseDSV({data: this._data, delimiter: this._delimiter, columnOffsets: false, maxRows});
+    let {nrows, offsets} = this._parseDSV({data: this._data, delimiter: this._delimiter, columnOffsets: false, maxRows});
     if (offsets[offsets.length - 1] > 4294967296) {
       throw 'csv too large for offsets to be stored as 32-bit integers';
     }
 
+    // Get number of columns in first row
+    let {ncols} = this._parseDSV({data: this._data, delimiter: this._delimiter, columnOffsets: true, maxRows: 1});
+
     // If the full column offsets array is small enough, cache all of them.
     if (nrows * ncols <= this._columnOffsetsMaxSize) {
       this._rowOffsets = new Uint32Array(0);
@@ -165,7 +172,7 @@ class DSVModel extends DataModel {
       let maxRows = Math.min(this._maxCacheGet, rowsLeft);
 
       // Parse the data to get the column offsets.
-      let {offsets} = parseDSV({
+      let {offsets} = this._parseDSV({
         data: this._data,
         delimiter: this._delimiter,
         columnOffsets: true,
@@ -182,10 +189,8 @@ class DSVModel extends DataModel {
 
     // Return index from cache.
     return this._columnOffsets[rowIndex + column];
-
   }
 
-
   _getField(row: number, column: number) {
     let value: string;
     let index = this._getOffsetIndex(row, column);
@@ -234,10 +239,23 @@ class DSVModel extends DataModel {
         this._computeOffsets();
         let end = performance.now();
         console.log(`Parsed full ${this._rowCount} rows, ${this._rowCount * this._columnCount} values, in ${(end - start) / 1000}s`);
-      }, 0);
+      });
     }
   }
 
+  private _parseDSV(options: parseDSV.IOptions) {
+    let start = performance.now();
+    let {nrows, ncols, offsets} = parseDSV(options);
+    let end = performance.now();
+    console.log(`Parsed with dsv ${nrows} rows in ${(end - start) / 1000}s`);
+
+    start = performance.now();
+    let nnrows = parseDSVNoQuotes(options).nrows;
+    end = performance.now();
+    console.log(`Parsed with no quote dsv ${nnrows} rows in ${(end - start) / 1000}s`);
+    return {nrows, ncols, offsets};
+  }
+
   private _data: string;
   private _delimiter: string;
   private _rowDelimiter: string;

+ 69 - 12
packages/csvviewer/src/parse.ts

@@ -29,6 +29,7 @@
  * @param options: The function options
  * @returns an object representing nrows/ncols parsed, and an offset array of
  * either nrows*ncols entries, or nrows entries, depending on the input option.
+ * If columnOffsets is false, the returned ncols will be 0.
  */
 export
 function parseDSV(options: parseDSV.IOptions): {nrows: number, ncols: number, offsets: number[]} {
@@ -93,7 +94,7 @@ function parseDSV(options: parseDSV.IOptions): {nrows: number, ncols: number, of
       if (nrows === maxRows) {
         // Could also do a labeled break to jump to the end, or could convert
         // this switch to an if/else and use a break to escape the while loop.
-        return {nrows, ncols, offsets};
+        return {nrows, ncols: columnOffsets ? ncols : 0, offsets};
       }
 
       // Push the row offset
@@ -220,7 +221,7 @@ function parseDSV(options: parseDSV.IOptions): {nrows: number, ncols: number, of
       throw 'state not recognized';
     }
   }
-  return {nrows, ncols, offsets};
+  return {nrows, ncols: columnOffsets ? ncols : 0, offsets};
 }
 
 export
@@ -257,6 +258,11 @@ namespace parseDSV {
      */
     delimiter?: string;
 
+    /**
+     * The line delimiter to use. Defaults to '\r\n' (parseDSVNoQuotes uses '\n').
+     */
+    lineDelimiter?: string;
+
     /**
      * Whether to use a regex to shortcut processing. If false, use a loop-based
      * shortcut which sometimes is faster. Defaults to false.
@@ -304,18 +310,69 @@ namespace parseDSV {
   }
 }
 
+/**
+ * Optimized row offset parsing assuming there are no quotes.
+ */
 export
-function _parseDSVNoQuotes(data: string, delimiter: string) {
+function parseDSVNoQuotes(options: parseDSV.IOptions) {
+  const {
+    data,
+    delimiter = ',',
+    lineDelimiter = '\n',
+    startIndex = 0,
+    columnOffsets = false,
+    maxRows = 0xFFFFFFFF,
+  } = options;
+  // ncols will be set automatically if it is undefined.
+  let ncols = options.ncols;
+  let lineDelimiterLength = lineDelimiter.length;
+  let i = startIndex;
   let len = data.length;
-  let i = 0;
-  let offsets = [0];
-  let k = 0;
-  while (i < len) {
-    k = data.indexOf('\r\n', i);
-    if (k > 0) {
-      offsets.push(k);
+  let nextLine: number;
+  let col: number;
+  let offsets: number[] = [];
+  let nrows = 0;
+  let rowString: string;
+
+
+  let lineEnd: number;
+  nextLine = startIndex;
+  while (nextLine !== -1 && nrows < maxRows) {
+    offsets.push(i);
+    nrows++;
+    nextLine = data.indexOf(lineDelimiter, i);
+    lineEnd = nextLine === -1 ? len : nextLine;
+
+    if (columnOffsets === true) {
+      // Assumes the slice is a zero-cost view. Otherwise it may be better to
+      // call indexOf on data directly until the result passes lineEnd or is
+      // -1, at the cost of possibly searching past the end of the line.
+      col = 1;
+      rowString = data.slice(i, lineEnd);
+      i = rowString.indexOf(delimiter);
+
+      if (ncols === undefined) {
+        while (i !== -1) {
+          offsets.push(startIndex + i);
+          col++;
+          i = rowString.indexOf(delimiter, i + 1);
+        }
+        ncols = col;
+      } else {
+        while (i !== -1 && col <= ncols) {
+          offsets.push(startIndex + i);
+          col++;
+          i = rowString.indexOf(delimiter, i + 1);
+        }
+        if (col < ncols) {
+          for (; col <= ncols; col++) {
+            offsets.push(nextLine);
+          }
+        }
+      }
     }
-    i = k;
+    i = lineEnd + lineDelimiterLength;
   }
-  return offsets;
+
+  return {nrows, ncols: columnOffsets ? ncols : 0, offsets};
 }