Bladeren bron

Merge pull request #3997 from jasongrout/csvoffset

Initial implementation of a fast CSV model.
Afshin Darian 7 jaren geleden
bovenliggende
commit
f775baa658

+ 1 - 0
buildutils/src/ensure-repo.ts

@@ -29,6 +29,7 @@ let UNUSED: { [key: string]: string[] } = {
   '@jupyterlab/theme-dark-extension': ['font-awesome'],
   '@jupyterlab/theme-light-extension': ['font-awesome'],
   '@jupyterlab/services': ['node-fetch', 'ws'],
+  '@jupyterlab/test-csvviewer': ['csv-spectrum'],
   '@jupyterlab/vega3-extension': ['vega', 'vega-lite']
 };
 

+ 6 - 4
docs/source/user/file_formats.rst

@@ -107,10 +107,12 @@ CSV
 -  File extension: ``.csv``
 -  MIME type: None
 
-Files with rows of comma-separated values (CSV files) are a common
-format for tabular data. The default viewer for CSV files in JupyterLab
-is a high-performance data grid viewer (which can also handle tab- and
-semicolon-separated values):
+Files with rows of comma-separated values (CSV files) are a common format for
+tabular data. The default viewer for CSV files in JupyterLab is a
+high-performance data grid viewer (which can also handle tab- and
+semicolon-separated values). JupyterLab can open files up to the maximum string
+size in the browser (which ranges from approximately 250MB to 2GB, depending on
+the browser):
 
 .. raw:: html
 

+ 2 - 3
packages/csvviewer/package.json

@@ -36,13 +36,12 @@
     "@phosphor/algorithm": "^1.1.2",
     "@phosphor/coreutils": "^1.3.0",
     "@phosphor/datagrid": "^0.1.5",
+    "@phosphor/disposable": "^1.1.2",
     "@phosphor/messaging": "^1.2.2",
     "@phosphor/signaling": "^1.2.2",
-    "@phosphor/widgets": "^1.5.0",
-    "d3-dsv": "~1.0.7"
+    "@phosphor/widgets": "^1.5.0"
   },
   "devDependencies": {
-    "@types/d3-dsv": "~1.0.30",
     "rimraf": "~2.6.2",
     "typescript": "~2.6.2"
   }

+ 2 - 0
packages/csvviewer/src/index.ts

@@ -3,5 +3,7 @@
 
 import '../style/index.css';
 
+export * from './model';
+export * from './parse';
 export * from './toolbar';
 export * from './widget';

+ 675 - 0
packages/csvviewer/src/model.ts

@@ -0,0 +1,675 @@
+// Copyright (c) Jupyter Development Team.
+// Distributed under the terms of the Modified BSD License.
+
+import {
+  DataModel
+} from '@phosphor/datagrid';
+
+import {
+  IDisposable
+} from '@phosphor/disposable';
+
+import {
+  PromiseDelegate
+} from '@phosphor/coreutils';
+
+import {
+  parseDSV, parseDSVNoQuotes, IParser
+} from './parse';
+
+/*
+Possible ideas for further implementation:
+
+- Show a spinner or something visible when we are doing delayed parsing.
+- The cache right now handles scrolling down great - it gets the next several hundred rows. However, scrolling up causes lots of cache misses - each new row causes a flush of the cache. When invalidating an entire cache, we should put the requested row in middle of the cache (adjusting for rows at the beginning or end). When populating a cache, we should retrieve rows both above and below the requested row.
+- When we have a header, and we are guessing the parser to use, try checking just the part of the file *after* the header row for quotes. I think often a first header row is quoted, but the rest of the file is not and can be parsed much faster.
+- autodetect the delimiter (look for comma, tab, semicolon in first line. If more than one found, parse first row with comma, tab, semicolon delimiters. One with most fields wins).
+- Toolbar buttons to control the row delimiter, the parsing engine (quoted/not quoted), the quote character, etc.
+- Investigate incremental loading strategies in the parseAsync function. In initial investigations, setting the chunk size to 100k in parseAsync seems to cause instability with large files in Chrome (such as 8-million row files). Perhaps this is because we are recycling the row offset and column offset arrays quickly? It doesn't seem that there is a memory leak. On this theory, perhaps we just need to keep the offsets list an actual list, and pass it into the parsing function to extend without copying, and finalize it into an array buffer only when we are done parsing. Or perhaps we double the size of the array buffer each time, which may be wasteful, but at the end we trim it down if it's too wasteful (perhaps we have our own object that is backed by an array buffer, but has a push method that will automatically double the array buffer size as needed, and a trim function to finalize the array to exactly the size needed)? Or perhaps we don't use array buffers at all - compare the memory cost and speed of keeping the offsets as lists instead of memory buffers.
+- Investigate a time-based incremental parsing strategy, rather than a row-based one. The parser could take a maximum time to parse (say 300ms), and will parse up to that duration, in which case the parser probably also needs a way to notify when it has reached the end of a file.
+- For very large files, where we are only storing a small cache, scrolling is very laggy in Safari. It would be good to profile it.
+*/
+
+/**
+ * Possible delimiter-separated data parsers.
+ */
+const PARSERS: {[key: string]: IParser} = {
+  'quotes': parseDSV,
+  'noquotes': parseDSVNoQuotes
+};
+
+/**
+ * A data model implementation for in-memory delimiter-separated data.
+ *
+ * #### Notes
+ * This model handles data with up to 2**32 characters.
+ */
+export
+class DSVModel extends DataModel implements IDisposable {
+  /**
+   * Create a data model with static CSV data.
+   *
+   * @param options - The options for initializing the data model.
+   */
+  constructor(options: DSVModel.IOptions) {
+    super();
+    let {
+      data,
+      delimiter=',',
+      rowDelimiter = undefined,
+      quote = '"',
+      quoteParser = undefined,
+      header = true,
+      initialRows = 500
+    } = options;
+    this._data = data;
+    this._delimiter = delimiter;
+    this._quote = quote;
+    this._quoteEscaped = new RegExp(quote + quote, 'g');
+    this._initialRows = initialRows;
+
+    // Guess the row delimiter if it was not supplied. This will be fooled if a
+    // different line delimiter possibility appears in the first row.
+    if (rowDelimiter === undefined) {
+      let i = data.slice(0, 5000).indexOf('\r');
+      if (i === -1) {
+        rowDelimiter = '\n';
+      } else if (data[i + 1] === '\n') {
+        rowDelimiter = '\r\n';
+      } else {
+        rowDelimiter = '\r';
+      }
+    }
+    this._rowDelimiter = rowDelimiter;
+
+    if (quoteParser === undefined) {
+      // Check for the existence of quotes if the quoteParser is not set.
+      quoteParser = (data.indexOf(quote) >= 0);
+    }
+    this._parser = quoteParser ? 'quotes' : 'noquotes';
+
+    // Parse the data.
+    this._parseAsync();
+
+    // Cache the header row.
+    if (header === true && this._columnCount > 0) {
+      let h = [];
+      for (let c = 0; c < this._columnCount; c++) {
+        h.push(this._getField(0, c));
+      }
+      this._header = h;
+    }
+  }
+
+  /**
+   * Whether this model has been disposed.
+   */
+  get isDisposed(): boolean {
+    return this._isDisposed;
+  }
+
+  /**
+   * A promise that resolves when the model has parsed all of its data.
+   */
+  get ready(): Promise<void> {
+    return this._ready.promise;
+  }
+
+  /**
+   * Get the row count for a region in the data model.
+   *
+   * @param region - The row region of interest.
+   *
+   * @returns - The row count for the region.
+   */
+  rowCount(region: DataModel.RowRegion): number {
+    if (region === 'body') {
+      if (this._header.length === 0) {
+        return this._rowCount;
+      } else {
+        return this._rowCount - 1;
+      }
+    }
+    return 1;
+  }
+
+  /**
+   * Get the column count for a region in the data model.
+   *
+   * @param region - The column region of interest.
+   *
+   * @returns - The column count for the region.
+   */
+  columnCount(region: DataModel.ColumnRegion): number {
+    if (region === 'body') {
+      return this._columnCount;
+    }
+    return 1;
+  }
+
+  /**
+   * Get the data value for a cell in the data model.
+   *
+   * @param region - The cell region of interest.
+   *
+   * @param row - The row index of the cell of interest.
+   *
+   * @param column - The column index of the cell of interest.
+   *
+   * @returns - The data value for the specified cell.
+   */
+  data(region: DataModel.CellRegion, row: number, column: number): string {
+    let value: string;
+
+    // Look up the field and value for the region.
+    switch (region) {
+    case 'body':
+      if (this._header.length === 0) {
+        value = this._getField(row, column);
+      } else {
+        value = this._getField(row + 1, column);
+      }
+      break;
+    case 'column-header':
+      if (this._header.length === 0) {
+        value = (column + 1).toString();
+      } else {
+        value = this._header[column];
+      }
+      break;
+    case 'row-header':
+      value = (row + 1).toString();
+      break;
+    case 'corner-header':
+      value = '';
+      break;
+    default:
+      throw 'unreachable';
+    }
+
+    // Return the final value.
+    return value;
+  }
+
+  /**
+   * Dispose the resources held by this model.
+   */
+  dispose(): void {
+    if (this._isDisposed) {
+      return;
+    }
+
+    this._columnCount = undefined;
+    this._rowCount = undefined;
+    this._rowOffsets = null;
+    this._columnOffsets = null;
+    this._data = null;
+
+    // Clear out state associated with the asynchronous parsing.
+    if (this._doneParsing === false) {
+      // Explicitly catch this rejection at least once so an error is not thrown
+      // to the console.
+      this.ready.catch(() => { return; });
+      this._ready.reject(undefined);
+    }
+    if (this._delayedParse !== null) {
+      window.clearTimeout(this._delayedParse);
+    }
+  }
+
+  /**
+   * Compute the row offsets and initialize the column offset cache.
+   *
+   * @param endRow - The last row to parse, from the start of the data (first
+   * row is row 1).
+   *
+   * #### Notes
+   * This method supports parsing the data incrementally by calling it with
+   * incrementally higher endRow. Rows that have already been parsed will not be
+   * parsed again.
+   */
+  private _computeRowOffsets(endRow = 4294967295): void {
+    // If we've already parsed up to endRow, or if we've already parsed the
+    // entire data set, return early.
+    if (this._rowCount >= endRow || this._doneParsing === true) {
+      return;
+    }
+
+    // Compute the column count if we don't already have it.
+    if (this._columnCount === undefined) {
+      // Get number of columns in first row
+      this._columnCount = (PARSERS[this._parser]({
+        data: this._data,
+        delimiter: this._delimiter,
+        rowDelimiter: this._rowDelimiter,
+        quote: this._quote,
+        columnOffsets: true,
+        maxRows: 1
+      })).ncols;
+    }
+
+    // Parse the data up to and including the requested row, starting from the
+    // last row offset we have.
+    let {nrows, offsets} = PARSERS[this._parser]({
+      data: this._data,
+      startIndex: this._rowOffsets[this._rowCount - 1],
+      delimiter: this._delimiter,
+      rowDelimiter: this._rowDelimiter,
+      quote: this._quote,
+      columnOffsets: false,
+      maxRows: endRow - this._rowCount + 1
+    });
+
+    // Return if we didn't actually get any new rows beyond the one we've
+    // already parsed.
+    if (nrows <= 1) {
+      this._doneParsing = true;
+      this._ready.resolve(undefined);
+      return;
+    }
+
+    // Update the row count.
+    let oldRowCount = this._rowCount;
+    this._rowCount = oldRowCount + nrows - 1;
+
+    // If we didn't reach the requested row, we must be done.
+    if (this._rowCount < endRow) {
+      this._doneParsing = true;
+      this._ready.resolve(undefined);
+    }
+
+    // Copy the new offsets into a new row offset array.
+    let oldRowOffsets = this._rowOffsets;
+    this._rowOffsets = new Uint32Array(this._rowCount);
+    this._rowOffsets.set(oldRowOffsets);
+    this._rowOffsets.set(offsets, oldRowCount - 1);
+
+    // Expand the column offsets array if needed
+
+    // If the full column offsets array is small enough, build a cache big
+    // enough for all column offsets. We allocate up to 128 megabytes:
+    // 128*(2**20 bytes/M)/(4 bytes/entry) = 33554432 entries.
+    let maxColumnOffsetsRows = Math.floor(33554432 / this._columnCount);
+
+    // We need to expand the column offset array if we were storing all column
+    // offsets before. Check to see if the previous size was small enough that
+    // we stored all column offsets.
+    if (oldRowCount <= maxColumnOffsetsRows) {
+      // Check to see if the new column offsets array is small enough to still
+      // store, or if we should cut over to a small cache.
+      if (this._rowCount <= maxColumnOffsetsRows) {
+        // Expand the existing column offset array for new column offsets.
+        let oldColumnOffsets = this._columnOffsets;
+        this._columnOffsets = new Uint32Array(this._rowCount * this._columnCount);
+        this._columnOffsets.set(oldColumnOffsets);
+        this._columnOffsets.fill(0xFFFFFFFF, oldColumnOffsets.length);
+      } else {
+        // If not, then our cache size is at most the maximum number of rows we
+        // fill in the cache at a time.
+        let oldColumnOffsets = this._columnOffsets;
+        this._columnOffsets = new Uint32Array(Math.min(this._maxCacheGet, maxColumnOffsetsRows) * this._columnCount);
+
+        // Fill in the entries we already have.
+        this._columnOffsets.set(oldColumnOffsets.subarray(0, this._columnOffsets.length));
+
+        // Invalidate the rest of the entries.
+        this._columnOffsets.fill(0xFFFFFFFF, oldColumnOffsets.length);
+        this._columnOffsetsStartingRow = 0;
+      }
+    }
+
+    // We have more rows than before, so emit the rows-inserted change signal.
+    let firstIndex = oldRowCount;
+    if (this._header.length > 0) {
+      firstIndex -= 1;
+    }
+    this.emitChanged({
+      type: 'rows-inserted',
+      region: 'body',
+      index: firstIndex,
+      span: this._rowCount - oldRowCount
+    });
+  }
+
+  /**
+   * Get the parsed string field for a row and column.
+   *
+   * @param row - The row number of the data item.
+   * @param column - The column number of the data item.
+   * @returns The parsed string for the data item.
+   */
+  private _getField(row: number, column: number): string {
+    // Declare local variables.
+    let value: string;
+    let nextIndex;
+
+    // Find the index for the first character in the field.
+    let index = this._getOffsetIndex(row, column);
+
+    // Initialize the trim adjustments.
+    let trimRight = 0;
+    let trimLeft = 0;
+
+    // Find the end of the slice (the start of the next field), and how much we
+    // should adjust to trim off a trailing field or row delimiter. First check
+    // if we are getting the last column.
+    if (column === this._columnCount - 1) {
+      // Check if we are getting any row but the last.
+      if (row < this._rowCount - 1) {
+        // Set the next offset to the next row, column 0.
+        nextIndex = this._getOffsetIndex(row + 1, 0);
+
+        // Since we are not at the last row, we need to trim off the row
+        // delimiter.
+        trimRight += this._rowDelimiter.length;
+      } else {
+        // We are getting the last data item, so the slice end is the end of the
+        // data string.
+        nextIndex = this._data.length;
+
+        // The string may or may not end in a row delimiter (RFC 4180 2.2), so
+        // we explicitly check if we should trim off a row delimiter.
+        if (this._data[nextIndex - 1] === this._rowDelimiter[this._rowDelimiter.length - 1]) {
+          trimRight += this._rowDelimiter.length;
+        }
+      }
+    } else {
+      // The next field starts at the next column offset.
+      nextIndex = this._getOffsetIndex(row, column + 1);
+
+      // Trim off the delimiter if it exists at the end of the field
+      if (index < nextIndex && this._data[nextIndex - 1] === this._delimiter) {
+        trimRight += 1;
+      }
+    }
+
+    // Check to see if the field begins with a quote. If it does, trim a quote on either side.
+    if (this._data[index] === this._quote) {
+      trimLeft += 1;
+      trimRight += 1;
+    }
+
+    // Slice the actual value out of the data string.
+    value = this._data.slice(index + trimLeft, nextIndex - trimRight);
+
+    // If we have a quoted field and we have an escaped quote inside it, unescape it.
+    if (trimLeft === 1 && value.indexOf(this._quote) !== -1) {
+      value = value.replace(this._quoteEscaped, this._quote);
+    }
+
+    // Return the value.
+    return value;
+  }
+
+  /**
+   * Get the index in the data string for the first character of a row and
+   * column.
+   *
+   * @param row - The row of the data item.
+   * @param column - The column of the data item.
+   * @returns - The index into the data string where the data item starts.
+   */
+  private _getOffsetIndex(row: number, column: number): number {
+    // Declare local variables.
+    const ncols = this._columnCount;
+
+    // Check to see if row *should* be in the cache, based on the cache size.
+    let rowIndex = (row - this._columnOffsetsStartingRow) * ncols;
+    if (rowIndex < 0 || rowIndex > this._columnOffsets.length) {
+      // Row isn't in the cache, so we invalidate the entire cache and set up
+      // the cache to hold the requested row.
+      this._columnOffsets.fill(0xFFFFFFFF);
+      this._columnOffsetsStartingRow = row;
+      rowIndex = 0;
+    }
+
+    // Check to see if we need to fetch the row data into the cache.
+    if (this._columnOffsets[rowIndex] === 0xFFFFFFFF) {
+      // Figure out how many rows below us also need to be fetched.
+      let maxRows = 1;
+      while (maxRows <= this._maxCacheGet && this._columnOffsets[rowIndex + maxRows * ncols] === 0xFFFFFF) {
+        maxRows++;
+      }
+
+      // Parse the data to get the column offsets.
+      let {offsets} = PARSERS[this._parser]({
+        data: this._data,
+        delimiter: this._delimiter,
+        rowDelimiter: this._rowDelimiter,
+        quote: this._quote,
+        columnOffsets: true,
+        maxRows: maxRows,
+        ncols: ncols,
+        startIndex: this._rowOffsets[row]
+      });
+
+      // Copy results to the cache.
+      for (let i = 0; i < offsets.length; i++) {
+        this._columnOffsets[rowIndex + i] = offsets[i];
+      }
+    }
+
+    // Return the offset index from cache.
+    return this._columnOffsets[rowIndex + column];
+  }
+
+  /**
+   * Parse the data string asynchronously.
+   *
+   * #### Notes
+   * It can take several seconds to parse a several hundred megabyte string, so
+   * we parse the first 500 rows to get something up on the screen, then we
+   * parse the full data string asynchronously.
+   */
+  private _parseAsync(): void {
+    // Number of rows to get initially.
+    let currentRows = this._initialRows;
+
+    // Number of rows to get in each chunk thereafter. We set this high to just
+    // get the rest of the rows for now.
+    let chunkRows = Math.pow(2, 32) - 1;
+
+    // We give the UI a chance to draw by delaying the chunk parsing.
+    let delay = 30; // milliseconds
+
+    // Define a function to parse a chunk up to and including endRow.
+    let parseChunk = (endRow: number) => {
+      try {
+        this._computeRowOffsets(endRow);
+      } catch (e) {
+        // Sometimes the data string cannot be parsed with the full parser (for
+        // example, we may have the wrong delimiter). In these cases, fall back to
+        // the simpler parser so we can show something.
+        if (this._parser === 'quotes') {
+          console.warn(e);
+          this._parser = 'noquotes';
+          this._resetParser();
+          this._computeRowOffsets(endRow);
+        } else {
+          throw e;
+        }
+      }
+      return this._doneParsing;
+    };
+
+    // Reset the parser to its initial state.
+    this._resetParser();
+
+    // Parse the first rows to give us the start of the data right away.
+    let done = parseChunk(currentRows);
+
+    // If we are done, return early.
+    if (done) {
+      return;
+    }
+
+    // Define a function to recursively parse the next chunk after a delay.
+    let delayedParse = () => {
+      // Parse up to the new end row.
+      let done = parseChunk(currentRows + chunkRows);
+      currentRows += chunkRows;
+
+      // Gradually double the chunk size until we reach a million rows, if we
+      // start below a million-row chunk size.
+      if (chunkRows < 1000000) {
+        chunkRows *= 2;
+      }
+
+      // If we aren't done, then schedule another parse.
+      if (done) {
+        this._delayedParse = null;
+      } else {
+        this._delayedParse = window.setTimeout(delayedParse, delay);
+      }
+    };
+
+    // Parse full data string in chunks, delayed by a few milliseconds to give the UI a chance to draw.
+    this._delayedParse = window.setTimeout(delayedParse, delay);
+  }
+
+  /**
+   * Reset the parser state.
+   */
+  private _resetParser(): void {
+    this._columnCount = undefined;
+
+    // First row offset is *always* 0, so we always have the first row offset.
+    this._rowOffsets = new Uint32Array(1);
+    this._rowCount = 1;
+
+    this._columnOffsets = new Uint32Array(0);
+
+    // Clear out state associated with the asynchronous parsing.
+    if (this._doneParsing === false) {
+      // Explicitly catch this rejection at least once so an error is not thrown
+      // to the console.
+      this.ready.catch(() => { return; });
+      this._ready.reject(undefined);
+    }
+    this._doneParsing = false;
+    this._ready = new PromiseDelegate<void>();
+    if (this._delayedParse !== null) {
+      window.clearTimeout(this._delayedParse);
+      this._delayedParse = null;
+    }
+
+    this.emitChanged({ type: 'model-reset' });
+  }
+
+  // Parser settings
+  private _delimiter: string;
+  private _quote: string;
+  private _quoteEscaped: RegExp;
+  private _parser: 'quotes' | 'noquotes';
+  private _rowDelimiter: string;
+
+  // Data values
+  private _data: string;
+  private _rowCount: number = 1;
+  private _columnCount: number;
+
+  // Cache information
+  /**
+   * The header strings.
+   */
+  private _header: string[] = [];
+  /**
+   * The column offset cache, starting with row _columnOffsetsStartingRow
+   *
+   * #### Notes
+   * The index of the first character in the data string for row r, column c is
+   * _columnOffsets[(r-this._columnOffsetsStartingRow)*numColumns+c]
+   */
+  private _columnOffsets: Uint32Array = new Uint32Array(0);
+  /**
+   * The row that _columnOffsets[0] represents.
+   */
+  private _columnOffsetsStartingRow: number = 0;
+  /**
+   * The maximum number of rows to parse when there is a cache miss.
+   */
+  private _maxCacheGet: number = 1000;
+  /**
+   * The index for the start of each row.
+   */
+  private _rowOffsets: Uint32Array = new Uint32Array(1);
+  /**
+   * The number of rows to parse initially before doing a delayed parse of the
+   * entire data.
+   */
+  private _initialRows: number;
+
+
+  // Bookkeeping variables.
+  private _delayedParse: number = null;
+  private _doneParsing: boolean = false;
+  private _isDisposed: boolean = false;
+  private _ready = new PromiseDelegate<void>();
+}
+
+
+/**
+ * The namespace for the `DSVModel` class statics.
+ */
+export
+namespace DSVModel {
+
+  /**
+   * An options object for initializing a delimiter-separated data model.
+   */
+  export
+  interface IOptions {
+    /**
+     * The field delimiter, such as ',' or '\t'.
+     *
+     * #### Notes
+     * The field delimiter must be a single character.
+     */
+    delimiter: string;
+
+    /**
+     * The data source for the data model.
+     */
+    data: string;
+
+    /**
+     * Whether the data has a one-row header.
+     */
+    header?: boolean;
+
+    /**
+     * Row delimiter.
+     *
+     * #### Notes
+     * Any carriage return or newline character that is not a delimiter should
+     * be in a quoted field, regardless of the row delimiter setting.
+     */
+    rowDelimiter?: '\r\n' | '\r' | '\n';
+
+    /**
+     * Quote character.
+     *
+     * #### Notes
+     * Quotes are escaped by repeating them, as in RFC 4180. The quote must be a
+     * single character.
+     */
+    quote?: string;
+
+    /**
+     * Whether to use the parser that can handle quoted delimiters.
+     *
+     * #### Notes
+     * Setting this to false uses a much faster parser, but assumes there are
+     * not any field or row delimiters that are quoted in fields. If this is not
+     * set, it defaults to true if any quotes are found in the data, and false
+     * otherwise.
+     */
+    quoteParser?: boolean;
+
+    /**
+     * The maximum number of initial rows to parse before doing an asynchronous
+     * full parse of the data. This should be greater than 0.
+     */
+    initialRows?: number;
+  }
+}

+ 559 - 0
packages/csvviewer/src/parse.ts

@@ -0,0 +1,559 @@
+// Copyright (c) Jupyter Development Team.
+// Distributed under the terms of the Modified BSD License.
+
+/*
+Possible options to add to the parser:
+
+- Optional offsets array to modify, so we don't need to create a new offsets list (we would need to be careful not to overwrite things if a row needs to be truncated.)
+- Comment character at the start of the line
+- Skip empty whitespace lines
+- Skip rows with empty columns
+- Logging an error for too many or too few fields on a line
+- Ignore whitespace around delimiters
+- Add an exported function in this file for getting a field from the returned offsets array (including stripping field or row delimiters and parsing quoted data). Right now this logic is in the DSVModel. Likely we want to keep the logic there for speed, but having it here as well will make the parser more self-contained and usable by others.
+- Sanity check on field size, with an error if the field exceeds the size
+- Tests against https://github.com/maxogden/csv-spectrum
+- Benchmark against https://www.npmjs.com/package/csv-parser and https://www.npmjs.com/package/csv-string and fast-csv.
+
+*/
+
+/**
+ * Interface for a delimiter-separated data parser.
+ *
+ * @param options: The parser options
+ * @returns An object giving the offsets for the rows or columns parsed.
+ *
+ * #### Notes
+ * The parsers are based on [RFC 4180](https://tools.ietf.org/html/rfc4180).
+ */
+export
+type IParser = (options: IParser.IOptions) => IParser.IResults;
+
+export
+namespace IParser {
+  /**
+   * The options for a parser.
+   */
+  export
+  interface IOptions {
+    /**
+     * The data to parse.
+     */
+    data: string;
+
+    /**
+     * Whether to return column offsets in the offsets array.
+     *
+     * #### Notes
+     * If false, the returned offsets array contains just the row offsets. If
+     * true, the returned offsets array contains all column offsets for each
+     * column in the rows (i.e., it has nrows*ncols entries). Individual rows
+     * will have empty columns added or extra columns merged into the last
+     * column if they do not have exactly ncols columns.
+     */
+    columnOffsets: boolean;
+
+    /**
+     * The delimiter to use. Defaults to ','.
+     */
+    delimiter?: string;
+
+    /**
+     * The row delimiter to use. Defaults to '\r\n'.
+     */
+    rowDelimiter?: string;
+
+    /**
+     * The quote character for quoting fields. Defaults to the double quote (").
+     *
+     * #### Notes
+     * As specified in [RFC 4180](https://tools.ietf.org/html/rfc4180), quotes
+     * are escaped in a quoted field by doubling them (for example, "a""b" is the field
+     * a"b).
+     */
+    quote?: string;
+
+    /**
+     * The starting index in the string for processing. Defaults to 0. This
+     * index should be the first character of a new row. This must be less than
+     * data.length.
+     */
+    startIndex?: number;
+
+    /**
+     * Maximum number of rows to parse.
+     *
+     * If this is not given, parsing proceeds to the end of the data.
+     */
+    maxRows?: number;
+
+    /**
+     * Number of columns in each row to parse.
+     *
+     * #### Notes
+     * If this is not given, the ncols defaults to the number of columns in the
+     * first row.
+     */
+    ncols?: number;
+
+  }
+
+  /**
+   * The results from a parser.
+   */
+  export
+  interface IResults {
+    /**
+     * The number of rows parsed.
+     */
+    nrows: number;
+
+    /**
+     * The number of columns parsed, or 0 if only row offsets are returned.
+     */
+    ncols: number;
+
+    /**
+     * The index offsets into the data string for the rows or data items.
+     *
+     * #### Notes
+     * If the columnOffsets argument to the parser is false, the offsets array
+     * will be an array of length nrows, where `offsets[r]` is the index of the
+     * first character of row r.
+     *
+     * If the columnOffsets argument to the parser is true, the offsets array
+     * will be an array of length `nrows*ncols`, where `offsets[r*ncols + c]` is
+     * the index of the first character of the item in row r, column c.
+     */
+    offsets: number[];
+  }
+}
+
+/**
+ * Possible parser states.
+ */
+enum STATE {
+  QUOTED_FIELD,
+  QUOTED_FIELD_QUOTE,
+  UNQUOTED_FIELD,
+  NEW_FIELD,
+  NEW_ROW,
+}
+
+/**
+ * Possible row delimiters for the parser.
+ */
+enum ROW_DELIMITER {
+  CR,
+  CRLF,
+  LF
+}
+
+/**
+ * Parse delimiter-separated data.
+ *
+ * @param options: The parser options
+ * @returns An object giving the offsets for the rows or columns parsed.
+ *
+ * #### Notes
+ * This implementation is based on [RFC 4180](https://tools.ietf.org/html/rfc4180).
+ */
+export
+function parseDSV(options: IParser.IOptions): IParser.IResults {
+  const {
+    data,
+    columnOffsets,
+    delimiter = ',',
+    startIndex = 0,
+    maxRows = 0xFFFFFFFF,
+    rowDelimiter = '\r\n',
+    quote = '"',
+  } = options;
+
+  // ncols will be set automatically if it is undefined.
+  let ncols = options.ncols;
+
+  // The number of rows we've already parsed.
+  let nrows = 0;
+
+  // The row or column offsets we return.
+  let offsets = [];
+
+  // Set up some useful local variables.
+  const CH_DELIMITER = delimiter.charCodeAt(0);
+  const CH_QUOTE = quote.charCodeAt(0);
+  const CH_LF = 10; // \n
+  const CH_CR = 13; // \r
+  const endIndex = data.length;
+  const { QUOTED_FIELD, QUOTED_FIELD_QUOTE, UNQUOTED_FIELD, NEW_FIELD, NEW_ROW } = STATE;
+  const { CR, LF, CRLF } = ROW_DELIMITER;
+  const [rowDelimiterCode, rowDelimiterLength] = (rowDelimiter === '\r\n' ? [CRLF, 2] : (rowDelimiter === '\r' ? [CR, 1] : [LF, 1]));
+
+  // Always start off at the beginning of a row.
+  let state = NEW_ROW;
+
+  // Set up the starting index.
+  let i = startIndex;
+
+  // We initialize to 0 just in case we are asked to parse past the end of the
+  // string. In that case, we want the number of columns to be 0.
+  let col = 0;
+
+  // Declare some useful temporaries
+  let char;
+
+  // Loop through the data string
+  while (i < endIndex) {
+    // i is the index of a character in the state.
+
+    // If we just hit a new row, and there are still characters left, push a new
+    // offset on and reset the column counter. We want this logic at the top of
+    // the while loop rather than the bottom because we don't want a trailing
+    // row delimiter at the end of the data to trigger a new row offset.
+    if (state === NEW_ROW) {
+      // Start a new row and reset the column counter.
+      offsets.push(i);
+      col = 1;
+    }
+
+    // Below, we handle this character, modify the parser state and increment the index to be consistent.
+
+    // Get the integer code for the current character, so the comparisons below
+    // are faster.
+    char = data.charCodeAt(i);
+
+    // Update the parser state. This switch statement is responsible for
+    // updating the state to be consistent with the index i+1 (we increment i
+    // after the switch statement). In some situations, we may increment i
+    // inside this loop to skip over indices as a shortcut.
+    switch (state) {
+
+    // At the beginning of a row or field, we can have a quote, row delimiter, or field delimiter.
+    case NEW_ROW:
+    case NEW_FIELD:
+      switch (char) {
+
+      // If we have a quote, we are starting an escaped field.
+      case CH_QUOTE:
+        state = QUOTED_FIELD;
+        break;
+
+      // A field delimiter means we are starting a new field.
+      case CH_DELIMITER:
+        state = NEW_FIELD;
+        break;
+
+      // A row delimiter means we are starting a new row.
+      case CH_CR:
+        if (rowDelimiterCode === CR) {
+          state = NEW_ROW;
+        } else if (rowDelimiterCode === CRLF && data.charCodeAt(i + 1) === CH_LF) {
+          // If we see an expected \r\n, then increment to the end of the delimiter.
+          i++;
+          state = NEW_ROW;
+        } else {
+          throw `string index ${i} (in row ${nrows}, column ${col}): carriage return found, but not as part of a row delimiter C ${ data.charCodeAt(i + 1)}`;
+        }
+        break;
+      case CH_LF:
+        if (rowDelimiterCode === LF) {
+          state = NEW_ROW;
+        } else {
+          throw `string index ${i} (in row ${nrows}, column ${col}): line feed found, but row delimiter starts with a carriage return`;
+        }
+        break;
+
+      // Otherwise, we are starting an unquoted field.
+      default:
+        state = UNQUOTED_FIELD;
+        break;
+      }
+      break;
+
+    // We are in a quoted field.
+    case QUOTED_FIELD:
+      // Skip ahead until we see another quote, which either ends the quoted
+      // field or starts an escaped quote.
+      i = data.indexOf(quote, i);
+      if (i < 0) {
+        throw `string index ${i} (in row ${nrows}, column ${col}): mismatched quote`;
+      }
+      state = QUOTED_FIELD_QUOTE;
+      break;
+
+    // We just saw a quote in a quoted field. This could be the end of the
+    // field, or it could be a repeated quote (i.e., an escaped quote according
+    // to RFC 4180).
+    case QUOTED_FIELD_QUOTE:
+      switch (char) {
+      // Another quote means we just saw an escaped quote, so we are still in
+      // the quoted field.
+      case CH_QUOTE:
+        state = QUOTED_FIELD;
+        break;
+
+      // A field or row delimiter means the quoted field just ended and we are
+      // going into a new field or new row.
+      case CH_DELIMITER:
+        state = NEW_FIELD;
+        break;
+
+      // A row delimiter means we are starting a new row in the next index.
+      case CH_CR:
+        if (rowDelimiterCode === CR) {
+          state = NEW_ROW;
+        } else if (rowDelimiterCode === CRLF && data.charCodeAt(i + 1) === CH_LF) {
+          // If we see an expected \r\n, then increment to the end of the delimiter.
+          i++;
+          state = NEW_ROW;
+        } else {
+          throw `string index ${i} (in row ${nrows}, column ${col}): carriage return found, but not as part of a row delimiter C ${ data.charCodeAt(i + 1)}`;
+        }
+        break;
+      case CH_LF:
+        if (rowDelimiterCode === LF) {
+          state = NEW_ROW;
+        } else {
+          throw `string index ${i} (in row ${nrows}, column ${col}): line feed found, but row delimiter starts with a carriage return`;
+        }
+        break;
+
+      default:
+        throw `string index ${i} (in row ${nrows}, column ${col}): quote in escaped field not followed by quote, delimiter, or row delimiter`;
+      }
+      break;
+
+    // We are in an unquoted field, so the only thing we look for is the next
+    // row or field delimiter.
+    case UNQUOTED_FIELD:
+      // Skip ahead to either the next field delimiter or possible start of a
+      // row delimiter (CR or LF).
+      while (i < endIndex) {
+        char = data.charCodeAt(i);
+        if (char === CH_DELIMITER || char === CH_LF || char === CH_CR) {
+          break;
+        }
+        i++;
+      }
+
+      // Process the character we're seeing in an unquoted field.
+      switch (char) {
+      // A field delimiter means we are starting a new field.
+      case CH_DELIMITER:
+        state = NEW_FIELD;
+        break;
+
+      // A row delimiter means we are starting a new row in the next index.
+      case CH_CR:
+        if (rowDelimiterCode === CR) {
+          state = NEW_ROW;
+        } else if (rowDelimiterCode === CRLF && data.charCodeAt(i + 1) === CH_LF) {
+          // If we see an expected \r\n, then increment to the end of the delimiter.
+          i++;
+          state = NEW_ROW;
+        } else {
+          throw `string index ${i} (in row ${nrows}, column ${col}): carriage return found, but not as part of a row delimiter C ${ data.charCodeAt(i + 1)}`;
+        }
+        break;
+      case CH_LF:
+        if (rowDelimiterCode === LF) {
+          state = NEW_ROW;
+        } else {
+          throw `string index ${i} (in row ${nrows}, column ${col}): line feed found, but row delimiter starts with a carriage return`;
+        }
+        break;
+
+      // Otherwise, we continue on in the unquoted field.
+      default: continue;
+      }
+      break;
+
+    // We should never reach this point since the parser state is handled above,
+    // so throw an error if we do.
+    default:
+      throw `string index ${i} (in row ${nrows}, column ${col}): state not recognized`;
+    }
+
+    // Increment i to the next character index
+    i++;
+
+    // Update return values based on state.
+    switch (state) {
+    case NEW_ROW:
+      nrows++;
+
+      // If we just parsed the first row and ncols is undefined, set it to
+      // the number of columns we found in the first row.
+      if (nrows === 1 && ncols === undefined) {
+        ncols = col;
+      }
+
+      // Pad or truncate the column offsets in the previous row if we are
+      // returning them.
+      if (columnOffsets === true) {
+        if (col < ncols) {
+          // We didn't have enough columns, so add some more column offsets that
+          // point to just before the row delimiter we just saw.
+          for (; col < ncols; col++) {
+            offsets.push(i - rowDelimiterLength);
+          }
+        } else if (col > ncols) {
+          // We had too many columns, so truncate them.
+          offsets.length = offsets.length - (col - ncols);
+        }
+      }
+
+      // Shortcut return if nrows reaches the maximum rows we are to parse.
+      if (nrows === maxRows) {
+        return {nrows, ncols: columnOffsets ? ncols : 0, offsets};
+      }
+      break;
+
+    case NEW_FIELD:
+      // If we are returning column offsets, log the current index.
+      if (columnOffsets === true) {
+        offsets.push(i);
+      }
+
+      // Update the column counter.
+      col++;
+      break;
+
+    default: break;
+    }
+
+  }
+
+  // If we finished parsing and we are *not* in the NEW_ROW state, then do the
+  // column padding/truncation for the last row. Also make sure ncols is
+  // defined.
+  if (state !== NEW_ROW) {
+    nrows++;
+    if (columnOffsets === true) {
+      // If ncols is *still* undefined, then we only parsed one row and didn't
+      // see a row delimiter, so set it to the number of columns we found.
+      if (ncols === undefined) {
+        ncols = col;
+      }
+
+      if (col < ncols) {
+        // We didn't have enough columns, so add more column offsets pointing
+        // near the end of the data (this branch runs only when the data did
+        // not end with a row delimiter).
+        for (; col < ncols; col++) {
+          offsets.push(i - (rowDelimiterLength - 1));
+        }
+      } else if (col > ncols) {
+        // We had too many columns, so truncate them.
+        offsets.length = offsets.length - (col - ncols);
+      }
+    }
+  }
+
+
+  return {nrows, ncols: columnOffsets ? ncols : 0, offsets};
+}
+
+
+/**
+ * Parse delimiter-separated data where no delimiter is quoted.
+ *
+ * @param options: The parser options
+ * @returns An object giving the offsets for the rows or columns parsed.
+ *
+ * #### Notes
+ * This function is an optimized parser for cases where there are no field or
+ * row delimiters in quotes. Note that the data can have quotes, but they are
+ * not interpreted in any special way. This implementation is based on [RFC
+ * 4180](https://tools.ietf.org/html/rfc4180), but disregards quotes.
+ */
+export
+function parseDSVNoQuotes(options: IParser.IOptions): IParser.IResults {
+  // Set option defaults.
+  const {
+    data,
+    columnOffsets,
+    delimiter = ',',
+    rowDelimiter = '\r\n',
+    startIndex = 0,
+    maxRows = 0xFFFFFFFF,
+  } = options;
+
+  // ncols will be set automatically if it is undefined.
+  let ncols = options.ncols;
+
+  // Set up our return variables.
+  let offsets: number[] = [];
+  let nrows = 0;
+
+  // Set up various state variables.
+  let rowDelimiterLength = rowDelimiter.length;
+  let currRow = startIndex;
+  let len = data.length;
+  let nextRow: number;
+  let col: number;
+  let rowString: string;
+  let colIndex: number;
+
+  // The end of the current row.
+  let rowEnd: number;
+
+  // Start parsing at the start index.
+  nextRow = startIndex;
+
+  // Loop through rows until we run out of data or we've reached maxRows.
+  while (nextRow !== -1 && nrows < maxRows && currRow < len) {
+    // Store the offset for the beginning of the row and increment the rows.
+    offsets.push(currRow);
+    nrows++;
+
+    // Find the next row delimiter.
+    nextRow = data.indexOf(rowDelimiter, currRow);
+
+    // If the next row delimiter is not found, set the end of the row to the
+    // end of the data string.
+    rowEnd = nextRow === -1 ? len : nextRow;
+
+    // If we are returning column offsets, push them onto the array.
+    if (columnOffsets === true) {
+      // Find the next field delimiter. We slice the current row out so that
+      // the indexOf will stop at the end of the row. It may be faster to
+      // just use a loop to check each character instead.
+      col = 1;
+      rowString = data.slice(currRow, rowEnd);
+      colIndex = rowString.indexOf(delimiter);
+
+      if (ncols === undefined) {
+        // If we don't know how many columns we need, loop through and find all
+        // of the field delimiters in this row.
+        while (colIndex !== -1) {
+          offsets.push(currRow + colIndex + 1);
+          col++;
+          colIndex = rowString.indexOf(delimiter, colIndex + 1);
+        }
+
+        // Set ncols to the number of fields we found.
+        ncols = col;
+      } else {
+        // If we know the number of columns we expect, find the field delimiters
+        // up to that many columns.
+        while (colIndex !== -1 && col < ncols) {
+          offsets.push(currRow + colIndex + 1);
+          col++;
+          colIndex = rowString.indexOf(delimiter, colIndex + 1);
+        }
+
+        // If we didn't reach the number of columns we expected, pad the offsets
+        // with the offset just before the row delimiter.
+        while (col < ncols) {
+          offsets.push(rowEnd);
+          col++;
+        }
+      }
+    }
+
+    // Skip past the row delimiter at the end of the row.
+    currRow = rowEnd + rowDelimiterLength;
+  }
+
+  return {nrows, ncols: columnOffsets ? ncols : 0, offsets};
+}

+ 13 - 68
packages/csvviewer/src/widget.ts

@@ -1,8 +1,6 @@
 // Copyright (c) Jupyter Development Team.
 // Distributed under the terms of the Modified BSD License.
 
-import * as dsv from 'd3-dsv';
-
 import {
   ActivityMonitor, PathExt
 } from '@jupyterlab/coreutils';
@@ -16,7 +14,7 @@ import {
 } from '@phosphor/coreutils';
 
 import {
-  DataGrid, JSONModel
+  DataGrid
 } from '@phosphor/datagrid';
 
 import {
@@ -31,6 +29,9 @@ import {
   CSVToolbar
 } from './toolbar';
 
+import {
+  DSVModel
+} from './model';
 
 /**
  * The class name added to a CSV viewer.
@@ -71,7 +72,7 @@ class CSVViewer extends Widget implements DocumentRegistry.IReadyWidget {
 
     this._grid = new DataGrid();
     this._grid.addClass(CSV_GRID_CLASS);
-    this._grid.headerVisibility = 'column';
+    this._grid.headerVisibility = 'all';
 
     this._toolbar = new CSVToolbar({ selected: this._delimiter });
     this._toolbar.delimiterChanged.connect(this._onDelimiterChanged, this);
@@ -142,13 +143,16 @@ class CSVViewer extends Widget implements DocumentRegistry.IReadyWidget {
   }
 
   /**
-   * Create the json model for the grid.
+   * Create the model for the grid.
    */
   private _updateGrid(): void {
-    let text = this._context.model.toString();
-    let [columns, data] = Private.parse(text, this._delimiter);
-    let fields = columns.map(name => ({ name, type: 'string' }));
-    this._grid.model = new JSONModel({ data, schema: { fields } });
+    let data: string = this._context.model.toString();
+    let delimiter = this._delimiter;
+    let oldModel = this._grid.model as DSVModel;
+    this._grid.model = new DSVModel({ data, delimiter });
+    if (oldModel) {
+      oldModel.dispose();
+    }
   }
 
   private _context: DocumentRegistry.Context;
@@ -190,62 +194,3 @@ class CSVViewerFactory extends ABCWidgetFactory<CSVViewer, DocumentRegistry.IMod
     return new CSVViewer({ context });
   }
 }
-
-
-/**
- * The namespace for the module implementation details.
- */
-namespace Private {
-  /**
-   * Parse DSV text with the given delimiter.
-   *
-   * @param text - The DSV text to parse.
-   *
-   * @param delimiter - The delimiter for parsing.
-   *
-   * @returns A tuple of `[columnNames, dataRows]`
-   */
-  export
-  function parse(text: string, delimiter: string): [string[], dsv.DSVRowString[]] {
-    let columns: string[] = [];
-    let rowFn: RowFn | null = null;
-    let rows = dsv.dsvFormat(delimiter).parseRows(text, row => {
-      if (rowFn) {
-        return rowFn(row);
-      }
-      columns = uniquifyColumns(row);
-      rowFn = makeRowFn(columns);
-    });
-    return [columns, rows];
-  }
-
-  /**
-   * Replace duplicate column names with unique substitutes.
-   */
-  function uniquifyColumns(columns: string[]): string[] {
-    let unique: string[] = [];
-    let set: { [key: string]: boolean } = Object.create(null);
-    for (let name of columns) {
-      let uniqueName = name;
-      for (let i = 1; uniqueName in set; ++i) {
-        uniqueName = `${name}.${i}`;
-      }
-      set[uniqueName] = true;
-      unique.push(uniqueName);
-    }
-    return unique;
-  }
-
-  /**
-   * A type alias for a row conversion function.
-   */
-  type RowFn = (r: string[]) => dsv.DSVRowString;
-
-  /**
-   * Create a row conversion function for the given column names.
-   */
-  function makeRowFn(columns: string[]): RowFn {
-    let pairs = columns.map((name, i) => `'${name.replace(/'/g, '\\\'')}':r[${i}]`).join(',');
-    return (new Function('r', `return {${pairs}};`)) as RowFn;
-  }
-}

+ 1 - 0
tests/test-csvviewer/package.json

@@ -20,6 +20,7 @@
     "@jupyterlab/docregistry": "^0.15.5",
     "@jupyterlab/services": "^1.1.4",
     "@phosphor/widgets": "^1.5.0",
+    "csv-spectrum": "~1.0.0",
     "expect.js": "~0.3.1",
     "simulate-event": "~1.4.0"
   },

+ 207 - 0
tests/test-csvviewer/src/model.spec.ts

@@ -0,0 +1,207 @@
+// Copyright (c) Jupyter Development Team.
+// Distributed under the terms of the Modified BSD License.
+
+import expect = require('expect.js');
+
+import {
+  DSVModel
+} from '@jupyterlab/csvviewer';
+
+
+/* tslint:disable:no-var-requires */
+const CSV_TEST_FILES = [
+  ['comma_in_quotes',
+   require('csv-spectrum/csvs/comma_in_quotes.csv'),
+   require('csv-spectrum/json/comma_in_quotes.json')],
+
+  ['empty',
+   require('csv-spectrum/csvs/empty.csv'),
+   require('csv-spectrum/json/empty.json')],
+
+  ['empty_crlf',
+   require('csv-spectrum/csvs/empty_crlf.csv'),
+   require('csv-spectrum/json/empty_crlf.json')],
+
+  ['escaped_quotes',
+   require('csv-spectrum/csvs/escaped_quotes.csv'),
+   require('csv-spectrum/json/escaped_quotes.json')],
+
+  ['json',
+   require('csv-spectrum/csvs/json.csv'),
+   require('csv-spectrum/json/json.json')],
+
+  ['newlines',
+   require('csv-spectrum/csvs/newlines.csv'),
+   require('csv-spectrum/json/newlines.json')],
+
+  ['newlines_crlf',
+   require('csv-spectrum/csvs/newlines_crlf.csv'),
+   require('csv-spectrum/json/newlines_crlf.json')],
+
+  ['quotes_and_newlines',
+   require('csv-spectrum/csvs/quotes_and_newlines.csv'),
+   require('csv-spectrum/json/quotes_and_newlines.json')],
+
+  ['simple',
+   require('csv-spectrum/csvs/simple.csv'),
+   require('csv-spectrum/json/simple.json')],
+
+  ['simple_crlf',
+   require('csv-spectrum/csvs/simple_crlf.csv'),
+   require('csv-spectrum/json/simple_crlf.json')],
+
+  ['utf8',
+   require('csv-spectrum/csvs/utf8.csv'),
+   require('csv-spectrum/json/utf8.json')]
+];
+/* tslint:enable:no-var-requires */
+
+
+describe('csvviewer/model', () => {
+
+  describe('DSVModel', () => {
+
+    describe('#constructor()', () => {
+
+      it('should instantiate a `DSVModel`', () => {
+        let d = new DSVModel({data: 'a,b,c\nd,e,f\n', delimiter: ','});
+        expect(d.rowCount('column-header')).to.be(1);
+        expect(d.rowCount('body')).to.be(1);
+        expect(d.columnCount('row-header')).to.be(1);
+        expect(d.columnCount('body')).to.be(3);
+        expect([0, 1, 2].map(i => d.data('column-header', 0, i))).to.eql(['a', 'b', 'c']);
+        expect([0, 1, 2].map(i => d.data('body', 0, i))).to.eql(['d', 'e', 'f']);
+      });
+
+    });
+
+    it('parses a number of test files correctly', () => {
+      for (let [ , csv, answer] of CSV_TEST_FILES) {
+        let d = new DSVModel({data: csv, delimiter: ','});
+        let labels = [];
+        for (let i = 0; i < d.columnCount('body'); i++) {
+          labels.push(d.data('column-header', 0, i));
+        }
+        let values = [];
+        for (let r = 0; r < d.rowCount('body'); r++) {
+          let row: {[key: string]: string} = {};
+          for (let c = 0; c < d.columnCount('body'); c++) {
+            row[labels[c]] = d.data('body', r, c);
+          }
+          values.push(row);
+        }
+        expect(values).to.eql(answer);
+      }
+    });
+
+    it('handles tab-separated data', () => {
+      let d = new DSVModel({data: 'a\tb\tc\nd\te\tf\n', delimiter: '\t'});
+      expect(d.rowCount('column-header')).to.be(1);
+      expect(d.rowCount('body')).to.be(1);
+      expect(d.columnCount('row-header')).to.be(1);
+      expect(d.columnCount('body')).to.be(3);
+      expect([0, 1, 2].map(i => d.data('column-header', 0, i))).to.eql(['a', 'b', 'c']);
+      expect([0, 1, 2].map(i => d.data('body', 0, i))).to.eql(['d', 'e', 'f']);
+
+    });
+
+    it('handles not having a header', () => {
+      let d = new DSVModel({data: 'a,b,c\nd,e,f\n', delimiter: ',', header: false});
+      expect(d.rowCount('column-header')).to.be(1);
+      expect(d.rowCount('body')).to.be(2);
+      expect(d.columnCount('row-header')).to.be(1);
+      expect(d.columnCount('body')).to.be(3);
+      expect([0, 1, 2].map(i => d.data('column-header', 0, i))).to.eql(['1', '2', '3']);
+      expect([0, 1, 2].map(i => d.data('body', 0, i))).to.eql(['a', 'b', 'c']);
+      expect([0, 1, 2].map(i => d.data('body', 1, i))).to.eql(['d', 'e', 'f']);
+    });
+
+    it('handles CRLF row delimiter', () => {
+      let d = new DSVModel({data: 'a,b,c\r\nd,e,f\r\n', delimiter: ',', rowDelimiter: '\r\n'});
+      expect(d.rowCount('column-header')).to.be(1);
+      expect(d.rowCount('body')).to.be(1);
+      expect(d.columnCount('row-header')).to.be(1);
+      expect(d.columnCount('body')).to.be(3);
+      expect([0, 1, 2].map(i => d.data('column-header', 0, i))).to.eql(['a', 'b', 'c']);
+      expect([0, 1, 2].map(i => d.data('body', 0, i))).to.eql(['d', 'e', 'f']);
+    });
+
+    it('handles CR row delimiter', () => {
+      let d = new DSVModel({data: 'a,b,c\rd,e,f\r', delimiter: ',', rowDelimiter: '\r'});
+      expect(d.rowCount('column-header')).to.be(1);
+      expect(d.rowCount('body')).to.be(1);
+      expect(d.columnCount('row-header')).to.be(1);
+      expect(d.columnCount('body')).to.be(3);
+      expect([0, 1, 2].map(i => d.data('column-header', 0, i))).to.eql(['a', 'b', 'c']);
+      expect([0, 1, 2].map(i => d.data('body', 0, i))).to.eql(['d', 'e', 'f']);
+    });
+
+    it('can guess the row delimiter', () => {
+      let d = new DSVModel({data: 'a,b,c\rd,e,f\r', delimiter: ','});
+      expect(d.rowCount('column-header')).to.be(1);
+      expect(d.rowCount('body')).to.be(1);
+      expect(d.columnCount('row-header')).to.be(1);
+      expect(d.columnCount('body')).to.be(3);
+      expect([0, 1, 2].map(i => d.data('column-header', 0, i))).to.eql(['a', 'b', 'c']);
+      expect([0, 1, 2].map(i => d.data('body', 0, i))).to.eql(['d', 'e', 'f']);
+    });
+
+    it('handles a given quote character', () => {
+      let d = new DSVModel({data: `a,'b','c'\r'd',e,'f'\r`, delimiter: ',', quote: `'`});
+      expect(d.rowCount('column-header')).to.be(1);
+      expect(d.rowCount('body')).to.be(1);
+      expect(d.columnCount('row-header')).to.be(1);
+      expect(d.columnCount('body')).to.be(3);
+      expect([0, 1, 2].map(i => d.data('column-header', 0, i))).to.eql(['a', 'b', 'c']);
+      expect([0, 1, 2].map(i => d.data('body', 0, i))).to.eql(['d', 'e', 'f']);
+    });
+
+    it('handles delimiters and quotes inside quotes', () => {
+      let d = new DSVModel({data: `'a\rx',b,'c''x'\r'd,x',e,'f'\r`, delimiter: ',', quote: `'`, rowDelimiter: '\r'});
+      expect(d.rowCount('column-header')).to.be(1);
+      expect(d.rowCount('body')).to.be(1);
+      expect(d.columnCount('row-header')).to.be(1);
+      expect(d.columnCount('body')).to.be(3);
+      expect([0, 1, 2].map(i => d.data('column-header', 0, i))).to.eql(['a\rx', 'b', `c'x`]);
+      expect([0, 1, 2].map(i => d.data('body', 0, i))).to.eql(['d,x', 'e', 'f']);
+    });
+
+    it('handles rows that are too short or too long', () => {
+      let d = new DSVModel({data: `a,b,c\n,c,d,e,f\ng,h`, delimiter: ','});
+      expect(d.rowCount('column-header')).to.be(1);
+      expect(d.rowCount('body')).to.be(2);
+      expect(d.columnCount('row-header')).to.be(1);
+      expect(d.columnCount('body')).to.be(3);
+      expect([0, 1, 2].map(i => d.data('column-header', 0, i))).to.eql(['a', 'b', 'c']);
+      expect([0, 1, 2].map(i => d.data('body', 0, i))).to.eql(['', 'c', 'd,e,f']);
+      expect([0, 1, 2].map(i => d.data('body', 1, i))).to.eql(['g', 'h', '']);
+    });
+
+    it('handles delayed parsing of rows past the initial rows', () => {
+      let d = new DSVModel({data: `a,b,c\nc,d,e\nf,g,h\ni,j,k`, delimiter: ',', initialRows: 2});
+      expect(d.rowCount('column-header')).to.be(1);
+      expect(d.rowCount('body')).to.be(1);
+      expect(d.columnCount('row-header')).to.be(1);
+      expect(d.columnCount('body')).to.be(3);
+      expect([0, 1, 2].map(i => d.data('column-header', 0, i))).to.eql(['a', 'b', 'c']);
+
+      // Expected behavior is that all unparsed data is lumped into the final field.
+      expect([0, 1, 2].map(i => d.data('body', 0, i))).to.eql(['c', 'd', 'e\nf,g,h\ni,j,k']);
+
+      // Check everything is in order after all the data has been parsed asynchronously.
+      return d.ready.then(() => {
+        expect(d.rowCount('column-header')).to.be(1);
+        expect(d.rowCount('body')).to.be(3);
+        expect(d.columnCount('row-header')).to.be(1);
+        expect(d.columnCount('body')).to.be(3);
+        expect([0, 1, 2].map(i => d.data('column-header', 0, i))).to.eql(['a', 'b', 'c']);
+        expect([0, 1, 2].map(i => d.data('body', 0, i))).to.eql(['c', 'd', 'e']);
+        expect([0, 1, 2].map(i => d.data('body', 1, i))).to.eql(['f', 'g', 'h']);
+        expect([0, 1, 2].map(i => d.data('body', 2, i))).to.eql(['i', 'j', 'k']);
+      });
+
+    });
+
+  });
+
+});

+ 234 - 0
tests/test-csvviewer/src/parse-noquotes.spec.ts

@@ -0,0 +1,234 @@
+// Copyright (c) Jupyter Development Team.
+// Distributed under the terms of the Modified BSD License.
+
+import expect = require('expect.js');
+
+import {
+  parseDSVNoQuotes as parser
+} from '@jupyterlab/csvviewer';
+
+
+describe('csvviewer/parsenoquotes', () => {
+
+  describe('parseDSVNoQuotes', () => {
+
+    it('does basic parsing of csv files', () => {
+      let data = `a,b,c,d\r\n0,1,2,3\r\n4,5,6,7`;
+      let options = {data};
+      let results;
+
+      results = parser({...options, columnOffsets: false});
+      expect(results.nrows).to.eql(3);
+      expect(results.ncols).to.eql(0);
+      expect(results.offsets).to.eql([0, 9, 18]);
+
+      results = parser({...options, columnOffsets: true});
+      expect(results.nrows).to.eql(3);
+      expect(results.ncols).to.eql(4);
+      expect(results.offsets).to.eql([0, 2, 4, 6, 9, 11, 13, 15, 18, 20, 22, 24]);
+    });
+
+    // For simplicity, we'll use \n as a row delimiter below.
+
+    it('handles trailing row delimiter', () => {
+      let data = `a,b,c,d\n0,1,2,3\n4,5,6,7\n`;
+      let options = {data, rowDelimiter: '\n'};
+      let results;
+
+      results = parser({...options, columnOffsets: false});
+      expect(results.nrows).to.eql(3);
+      expect(results.offsets).to.eql([0, 8, 16]);
+
+      results = parser({...options, columnOffsets: true});
+      expect(results.nrows).to.eql(3);
+      expect(results.ncols).to.eql(4);
+      expect(results.offsets).to.eql([0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22]);
+    });
+
+    it('handles changing the field delimiter', () => {
+      let data = `a\tb\tc\td\n0\t1\t2\t3\n4\t5\t6\t7\n`;
+      let options = {data, delimiter: '\t', rowDelimiter: '\n'};
+      let results;
+
+      results = parser({...options, columnOffsets: false});
+      expect(results.nrows).to.eql(3);
+      expect(results.offsets).to.eql([0, 8, 16]);
+
+      results = parser({...options, columnOffsets: true});
+      expect(results.nrows).to.eql(3);
+      expect(results.ncols).to.eql(4);
+      expect(results.offsets).to.eql([0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22]);
+    });
+
+    it('handles starting on a new row', () => {
+      let data = `a,b,c,d\n0,1,2,3\n4,5,6,7\n`;
+      let options = {data, rowDelimiter: '\n', startIndex: 8};
+      let results;
+
+      results = parser({...options, columnOffsets: false});
+      expect(results.nrows).to.eql(2);
+      expect(results.offsets).to.eql([8, 16]);
+
+      results = parser({...options, columnOffsets: true});
+      expect(results.nrows).to.eql(2);
+      expect(results.ncols).to.eql(4);
+      expect(results.offsets).to.eql([8, 10, 12, 14, 16, 18, 20, 22]);
+    });
+
+    it('handles a max row argument', () => {
+      let data = `a,b,c,d\n0,1,2,3\n4,5,6,7\n`;
+      let options = {data, rowDelimiter: '\n', maxRows: 2};
+      let results;
+
+      results = parser({...options, columnOffsets: false});
+      expect(results.nrows).to.eql(2);
+      expect(results.offsets).to.eql([0, 8]);
+
+      results = parser({...options, columnOffsets: true});
+      expect(results.nrows).to.eql(2);
+      expect(results.ncols).to.eql(4);
+      expect(results.offsets).to.eql([0, 2, 4, 6, 8, 10, 12, 14]);
+    });
+
+    it('handles a start index and max row argument', () => {
+      let data = `a,b,c,d\n0,1,2,3\n4,5,6,7\n`;
+      let options = {data, rowDelimiter: '\n', startIndex: 8, maxRows: 1};
+      let results;
+
+      results = parser({...options, columnOffsets: false});
+      expect(results.nrows).to.eql(1);
+      expect(results.offsets).to.eql([8]);
+
+      results = parser({...options, columnOffsets: true});
+      expect(results.nrows).to.eql(1);
+      expect(results.ncols).to.eql(4);
+      expect(results.offsets).to.eql([8, 10, 12, 14]);
+    });
+
+    it('adjusts columns to match first row by default', () => {
+      let data = `a,b,c,d\n0,\n1,2,3,4,5,6`;
+      let options = {data, rowDelimiter: '\n'};
+      let results;
+
+      results = parser({...options, columnOffsets: false});
+      expect(results.nrows).to.eql(3);
+      expect(results.offsets).to.eql([0, 8, 11]);
+
+      results = parser({...options, columnOffsets: true});
+      expect(results.nrows).to.eql(3);
+      expect(results.ncols).to.eql(4);
+      expect(results.offsets).to.eql([0, 2, 4, 6, 8, 10, 10, 10, 11, 13, 15, 17]);
+    });
+
+    it('adjusts columns to match first row by default with CRLF row delimiter', () => {
+      let data = `a,b,c,d\r\n0,\r\n1,2,3,4,5,6`;
+      let options = {data, rowDelimiter: '\r\n'};
+      let results;
+
+      results = parser({...options, columnOffsets: false});
+      expect(results.nrows).to.eql(3);
+      expect(results.offsets).to.eql([0, 9, 13]);
+
+      results = parser({...options, columnOffsets: true});
+      expect(results.nrows).to.eql(3);
+      expect(results.ncols).to.eql(4);
+      expect(results.offsets).to.eql([ 0, 2, 4, 6, 9, 11, 11, 11, 13, 15, 17, 19 ]);
+    });
+
+    it('adjusts columns to match ncols', () => {
+      let data = `a,b,c,d\n0,\n1,2,3,4,5,6`;
+      let options = {data, rowDelimiter: '\n', ncols: 5};
+      let results;
+
+      results = parser({...options, columnOffsets: false});
+      expect(results.nrows).to.eql(3);
+      expect(results.offsets).to.eql([0, 8, 11]);
+
+      results = parser({...options, columnOffsets: true});
+      expect(results.nrows).to.eql(3);
+      expect(results.ncols).to.eql(5);
+      expect(results.offsets).to.eql([0, 2, 4, 6, 7, 8, 10, 10, 10, 10, 11, 13, 15, 17, 19]);
+    });
+
+    it('adjusts columns to match ncols with CRLF row delimiter', () => {
+      let data = `a,b,c,d\r\n0,\r\n1,2,3,4,5,6`;
+      let options = {data, rowDelimiter: '\r\n', ncols: 5};
+      let results;
+
+      results = parser({...options, columnOffsets: false});
+      expect(results.nrows).to.eql(3);
+      expect(results.offsets).to.eql([0, 9, 13]);
+
+      results = parser({...options, columnOffsets: true});
+      expect(results.nrows).to.eql(3);
+      expect(results.ncols).to.eql(5);
+      expect(results.offsets).to.eql([0, 2, 4, 6, 7, 9, 11, 11, 11, 11, 13, 15, 17, 19, 21]);
+    });
+
+    it('adjusts columns to match ncols with one row', () => {
+      let data = `a,b,c,d`;
+      let options = {data, rowDelimiter: '\n', ncols: 7};
+      let results;
+
+      results = parser({...options, columnOffsets: false});
+      expect(results.nrows).to.eql(1);
+      expect(results.offsets).to.eql([0]);
+
+      results = parser({...options, columnOffsets: true});
+      expect(results.nrows).to.eql(1);
+      expect(results.ncols).to.eql(7);
+      expect(results.offsets).to.eql([0, 2, 4, 6, 7, 7, 7]);
+    });
+
+    it('adjusts columns to match ncols with one row and trailing delimiter', () => {
+      let data = `a,b,c,d\n`;
+      let options = {data, rowDelimiter: '\n', ncols: 7};
+      let results;
+
+      results = parser({...options, columnOffsets: false});
+      expect(results.nrows).to.eql(1);
+      expect(results.offsets).to.eql([0]);
+
+      results = parser({...options, columnOffsets: true});
+      expect(results.nrows).to.eql(1);
+      expect(results.ncols).to.eql(7);
+      expect(results.offsets).to.eql([0, 2, 4, 6, 7, 7, 7]);
+    });
+
+    it('handles a single row delimiter', () => {
+      let data = `\n`;
+      let options = {data, rowDelimiter: '\n'};
+      let results;
+
+      results = parser({...options, columnOffsets: false});
+      expect(results.nrows).to.eql(1);
+      expect(results.offsets).to.eql([0]);
+
+      results = parser({...options, columnOffsets: true});
+      expect(results.nrows).to.eql(1);
+      expect(results.ncols).to.eql(1);
+      expect(results.offsets).to.eql([0]);
+    });
+
+    it('handles adding columns or merging columns as necessary', () => {
+      let data = `a,b,c\n,c,d,e,f\ng,h`;
+      let options = {data, rowDelimiter: '\n'};
+      let results;
+
+      results = parser({...options, columnOffsets: false});
+      expect(results.nrows).to.eql(3);
+      expect(results.offsets).to.eql([0, 6, 15]);
+
+      results = parser({...options, columnOffsets: true});
+      expect(results.nrows).to.eql(3);
+      expect(results.ncols).to.eql(3);
+      expect(results.offsets).to.eql([0, 2, 4, 6, 7, 9, 15, 17, 18]);
+    });
+
+  });
+});
+
+// Helpful debugging logging
+// console.log(Array.from(results.offsets));
+// console.log(Array.from(results.offsets).map(i => data[i]));
+// console.log(Array.from(results.offsets).map((i, ind, arr) => data.slice(i, arr[ind + 1])));

+ 341 - 0
tests/test-csvviewer/src/parse.spec.ts

@@ -0,0 +1,341 @@
+// Copyright (c) Jupyter Development Team.
+// Distributed under the terms of the Modified BSD License.
+
+import expect = require('expect.js');
+
+import {
+  parseDSV as parser
+} from '@jupyterlab/csvviewer';
+
+describe('csvviewer/parse', () => {
+
+  describe('parseDSV', () => {
+
+    it('does basic parsing of csv files', () => {
+      let data = `a,b,c,d\r\n0,1,2,3\r\n4,5,6,7`;
+      let options = {data};
+      let results;
+
+      results = parser({...options, columnOffsets: false});
+      expect(results.nrows).to.eql(3);
+      expect(results.ncols).to.eql(0);
+      expect(results.offsets).to.eql([0, 9, 18]);
+
+      results = parser({...options, columnOffsets: true});
+      expect(results.nrows).to.eql(3);
+      expect(results.ncols).to.eql(4);
+      expect(results.offsets).to.eql([0, 2, 4, 6, 9, 11, 13, 15, 18, 20, 22, 24]);
+    });
+
+    // For simplicity, we'll use \n as a row delimiter below.
+
+    it('handles trailing row delimiter', () => {
+      let data = `a,b,c,d\n0,1,2,3\n4,5,6,7\n`;
+      let options = {data, rowDelimiter: '\n'};
+      let results;
+
+      results = parser({...options, columnOffsets: false});
+      expect(results.nrows).to.eql(3);
+      expect(results.offsets).to.eql([0, 8, 16]);
+
+      results = parser({...options, columnOffsets: true});
+      expect(results.nrows).to.eql(3);
+      expect(results.ncols).to.eql(4);
+      expect(results.offsets).to.eql([0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22]);
+    });
+
+    it('handles changing the field delimiter', () => {
+      let data = `a\tb\tc\td\n0\t1\t2\t3\n4\t5\t6\t7\n`;
+      let options = {data, delimiter: '\t', rowDelimiter: '\n'};
+      let results;
+
+      results = parser({...options, columnOffsets: false});
+      expect(results.nrows).to.eql(3);
+      expect(results.offsets).to.eql([0, 8, 16]);
+
+      results = parser({...options, columnOffsets: true});
+      expect(results.nrows).to.eql(3);
+      expect(results.ncols).to.eql(4);
+      expect(results.offsets).to.eql([0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22]);
+    });
+
+    it('handles starting on a new row', () => {
+      let data = `a,b,c,d\n0,1,2,3\n4,5,6,7\n`;
+      let options = {data, rowDelimiter: '\n', startIndex: 8};
+      let results;
+
+      results = parser({...options, columnOffsets: false});
+      expect(results.nrows).to.eql(2);
+      expect(results.offsets).to.eql([8, 16]);
+
+      results = parser({...options, columnOffsets: true});
+      expect(results.nrows).to.eql(2);
+      expect(results.ncols).to.eql(4);
+      expect(results.offsets).to.eql([8, 10, 12, 14, 16, 18, 20, 22]);
+    });
+
+    it('handles a max row argument', () => {
+      let data = `a,b,c,d\n0,1,2,3\n4,5,6,7\n`;
+      let options = {data, rowDelimiter: '\n', maxRows: 2};
+      let results;
+
+      results = parser({...options, columnOffsets: false});
+      expect(results.nrows).to.eql(2);
+      expect(results.offsets).to.eql([0, 8]);
+
+      results = parser({...options, columnOffsets: true});
+      expect(results.nrows).to.eql(2);
+      expect(results.ncols).to.eql(4);
+      expect(results.offsets).to.eql([0, 2, 4, 6, 8, 10, 12, 14]);
+    });
+
+    it('handles a start index and max row argument', () => {
+      let data = `a,b,c,d\n0,1,2,3\n4,5,6,7\n`;
+      let options = {data, rowDelimiter: '\n', startIndex: 8, maxRows: 1};
+      let results;
+
+      results = parser({...options, columnOffsets: false});
+      expect(results.nrows).to.eql(1);
+      expect(results.offsets).to.eql([8]);
+
+      results = parser({...options, columnOffsets: true});
+      expect(results.nrows).to.eql(1);
+      expect(results.ncols).to.eql(4);
+      expect(results.offsets).to.eql([8, 10, 12, 14]);
+    });
+
+    it('adjusts columns to match first row by default', () => {
+      let data = `a,b,c,d\n0,\n1,2,3,4,5,6`;
+      let options = {data, rowDelimiter: '\n'};
+      let results;
+
+      results = parser({...options, columnOffsets: false});
+      expect(results.nrows).to.eql(3);
+      expect(results.offsets).to.eql([0, 8, 11]);
+
+      results = parser({...options, columnOffsets: true});
+      expect(results.nrows).to.eql(3);
+      expect(results.ncols).to.eql(4);
+      expect(results.offsets).to.eql([0, 2, 4, 6, 8, 10, 10, 10, 11, 13, 15, 17]);
+    });
+
+    it('adjusts columns to match first row by default with CRLF row delimiter', () => {
+      let data = `a,b,c,d\r\n0,\r\n1,2,3,4,5,6`;
+      let options = {data, rowDelimiter: '\r\n'};
+      let results;
+
+      results = parser({...options, columnOffsets: false});
+      expect(results.nrows).to.eql(3);
+      expect(results.offsets).to.eql([0, 9, 13]);
+
+      results = parser({...options, columnOffsets: true});
+      expect(results.nrows).to.eql(3);
+      expect(results.ncols).to.eql(4);
+      expect(results.offsets).to.eql([ 0, 2, 4, 6, 9, 11, 11, 11, 13, 15, 17, 19 ]);
+    });
+
+    it('adjusts columns to match ncols', () => {
+      let data = `a,b,c,d\n0,\n1,2,3,4,5,6`;
+      let options = {data, rowDelimiter: '\n', ncols: 5};
+      let results;
+
+      results = parser({...options, columnOffsets: false});
+      expect(results.nrows).to.eql(3);
+      expect(results.offsets).to.eql([0, 8, 11]);
+
+      results = parser({...options, columnOffsets: true});
+      expect(results.nrows).to.eql(3);
+      expect(results.ncols).to.eql(5);
+      expect(results.offsets).to.eql([0, 2, 4, 6, 7, 8, 10, 10, 10, 10, 11, 13, 15, 17, 19]);
+    });
+
+    it('adjusts columns to match ncols with CRLF row delimiter', () => {
+      let data = `a,b,c,d\r\n0,\r\n1,2,3,4,5,6`;
+      let options = {data, rowDelimiter: '\r\n', ncols: 5};
+      let results;
+
+      results = parser({...options, columnOffsets: false});
+      expect(results.nrows).to.eql(3);
+      expect(results.offsets).to.eql([0, 9, 13]);
+
+      results = parser({...options, columnOffsets: true});
+      expect(results.nrows).to.eql(3);
+      expect(results.ncols).to.eql(5);
+      expect(results.offsets).to.eql([0, 2, 4, 6, 7, 9, 11, 11, 11, 11, 13, 15, 17, 19, 21]);
+    });
+
+    it('adjusts columns to match ncols with one row', () => {
+      let data = `a,b,c,d`;
+      let options = {data, rowDelimiter: '\n', ncols: 7};
+      let results;
+
+      results = parser({...options, columnOffsets: false});
+      expect(results.nrows).to.eql(1);
+      expect(results.offsets).to.eql([0]);
+
+      results = parser({...options, columnOffsets: true});
+      expect(results.nrows).to.eql(1);
+      expect(results.ncols).to.eql(7);
+      expect(results.offsets).to.eql([0, 2, 4, 6, 7, 7, 7]);
+    });
+
+    it('adjusts columns to match ncols with one row and trailing delimiter', () => {
+      let data = `a,b,c,d\n`;
+      let options = {data, rowDelimiter: '\n', ncols: 7};
+      let results;
+
+      results = parser({...options, columnOffsets: false});
+      expect(results.nrows).to.eql(1);
+      expect(results.offsets).to.eql([0]);
+
+      results = parser({...options, columnOffsets: true});
+      expect(results.nrows).to.eql(1);
+      expect(results.ncols).to.eql(7);
+      expect(results.offsets).to.eql([0, 2, 4, 6, 7, 7, 7]);
+    });
+
+    it('handles a single row delimiter', () => {
+      let data = `\n`;
+      let options = {data, rowDelimiter: '\n'};
+      let results;
+
+      results = parser({...options, columnOffsets: false});
+      expect(results.nrows).to.eql(1);
+      expect(results.offsets).to.eql([0]);
+
+      results = parser({...options, columnOffsets: true});
+      expect(results.nrows).to.eql(1);
+      expect(results.ncols).to.eql(1);
+      expect(results.offsets).to.eql([0]);
+    });
+
+    it('handles adding columns or merging columns as necessary', () => {
+      let data = `a,b,c\n,c,d,e,f\ng,h`;
+      let options = {data, rowDelimiter: '\n'};
+      let results;
+
+      results = parser({...options, columnOffsets: false});
+      expect(results.nrows).to.eql(3);
+      expect(results.offsets).to.eql([0, 6, 15]);
+
+      results = parser({...options, columnOffsets: true});
+      expect(results.nrows).to.eql(3);
+      expect(results.ncols).to.eql(3);
+      expect(results.offsets).to.eql([0, 2, 4, 6, 7, 9, 15, 17, 18]);
+    });
+
+  });
+
+  describe('parseDSV quotes', () => {
+
+    it('does basic parsing of quoted csv files', () => {
+      let data = `first,"last",address,city,zip`;
+      let options = {data, rowDelimiter: '\n'};
+      let results;
+
+      results = parser({...options, columnOffsets: false});
+      expect(results.nrows).to.eql(1);
+      expect(results.offsets).to.eql([0]);
+
+      results = parser({...options, columnOffsets: true});
+      expect(results.nrows).to.eql(1);
+      expect(results.ncols).to.eql(5);
+      expect(results.offsets).to.eql([0, 6, 13, 21, 26]);
+    });
+
+    it('handles quotes with field delimiters', () => {
+      let data = `a,"b,c",d\n"e","f"`;
+      let options = {data, rowDelimiter: '\n'};
+      let results;
+
+      results = parser({...options, columnOffsets: false});
+      expect(results.nrows).to.eql(2);
+      expect(results.offsets).to.eql([0, 10]);
+
+      results = parser({...options, columnOffsets: true});
+      expect(results.nrows).to.eql(2);
+      expect(results.ncols).to.eql(3);
+      expect(results.offsets).to.eql([0, 2, 8, 10, 14, 17]);
+    });
+
+    it('handles quotes with row delimiters', () => {
+      let data = `a,"b\nc",d\ne,f`;
+      let options = {data, rowDelimiter: '\n'};
+      let results;
+
+      results = parser({...options, columnOffsets: false});
+      expect(results.nrows).to.eql(2);
+      expect(results.offsets).to.eql([0, 10]);
+
+      results = parser({...options, columnOffsets: true});
+      expect(results.nrows).to.eql(2);
+      expect(results.ncols).to.eql(3);
+      expect(results.offsets).to.eql([0, 2, 8, 10, 12, 13]);
+    });
+
+    it('handles quotes with escaped quotes', () => {
+      let data = `a,"b""c",d\ne,f`;
+      let options = {data, rowDelimiter: '\n'};
+      let results;
+
+      results = parser({...options, columnOffsets: false});
+      expect(results.nrows).to.eql(2);
+      expect(results.offsets).to.eql([0, 11]);
+
+      results = parser({...options, columnOffsets: true});
+      expect(results.nrows).to.eql(2);
+      expect(results.ncols).to.eql(3);
+      expect(results.offsets).to.eql([0, 2, 9, 11, 13, 14]);
+    });
+
+    it('handles setting the quote character', () => {
+      let data = `a,'b'',\nc',d\ne,f`;
+      let options = {data, rowDelimiter: '\n', quote: `'`};
+      let results;
+
+      results = parser({...options, columnOffsets: false});
+      expect(results.nrows).to.eql(2);
+      expect(results.offsets).to.eql([0, 13]);
+
+      results = parser({...options, columnOffsets: true});
+      expect(results.nrows).to.eql(2);
+      expect(results.ncols).to.eql(3);
+      expect(results.offsets).to.eql([0, 2, 11, 13, 15, 16]);
+    });
+
+    it('handles single quoted field', () => {
+      let data = `"a"`;
+      let options = {data, rowDelimiter: '\n'};
+      let results;
+
+      results = parser({...options, columnOffsets: false});
+      expect(results.nrows).to.eql(1);
+      expect(results.offsets).to.eql([0]);
+
+      results = parser({...options, columnOffsets: true});
+      expect(results.nrows).to.eql(1);
+      expect(results.ncols).to.eql(1);
+      expect(results.offsets).to.eql([0]);
+    });
+
+    it('handles empty quoted field', () => {
+      let data = `a,"",b`;
+      let options = {data, rowDelimiter: '\n'};
+      let results;
+
+      results = parser({...options, columnOffsets: false});
+      expect(results.nrows).to.eql(1);
+      expect(results.offsets).to.eql([0]);
+
+      results = parser({...options, columnOffsets: true});
+      expect(results.nrows).to.eql(1);
+      expect(results.ncols).to.eql(3);
+      expect(results.offsets).to.eql([0, 2, 5]);
+    });
+
+  });
+});
+
+// Helpful debugging logging
+// console.log(Array.from(results.offsets));
+// console.log(Array.from(results.offsets).map((i, ind, arr) => data.slice(i, arr[ind + 1])));

+ 1 - 0
tests/webpack.config.js

@@ -44,6 +44,7 @@ module.exports = {
         exclude: path.join(process.cwd(), 'node_modules')
       },
       { test: /\.css$/, use: ['style-loader', 'css-loader'] },
+      { test: /\.csv$/, use: 'raw-loader' },
       { test: /\.(json|ipynb)$/, use: 'json-loader' },
       { test: /\.html$/, use: 'file-loader' },
       { test: /\.md$/, use: 'raw-loader' },

+ 5 - 5
yarn.lock

@@ -128,10 +128,6 @@
   version "1.1.1"
   resolved "https://registry.npmjs.org/@types/comment-json/-/comment-json-1.1.1.tgz#b4ae889912a93e64619f97989aecaff8ce889dca"
 
-"@types/d3-dsv@~1.0.30":
-  version "1.0.31"
-  resolved "https://registry.npmjs.org/@types/d3-dsv/-/d3-dsv-1.0.31.tgz#468302f18ac44db2a3944086388d862503ab9c6c"
-
 "@types/events@*":
   version "1.2.0"
   resolved "https://registry.npmjs.org/@types/events/-/events-1.2.0.tgz#81a6731ce4df43619e5c8c945383b3e62a89ea86"
@@ -2135,6 +2131,10 @@ csso@~2.3.1:
     clap "^1.0.9"
     source-map "^0.5.3"
 
+csv-spectrum@~1.0.0:
+  version "1.0.0"
+  resolved "https://registry.npmjs.org/csv-spectrum/-/csv-spectrum-1.0.0.tgz#591ac9ff48ad4f3eb4338457bc9801b349e3d628"
+
 currently-unhandled@^0.4.1:
   version "0.4.1"
   resolved "https://registry.npmjs.org/currently-unhandled/-/currently-unhandled-0.4.1.tgz#988df33feab191ef799a61369dd76c17adf957ea"
@@ -2171,7 +2171,7 @@ d3-dispatch@1:
   version "1.0.3"
   resolved "https://registry.npmjs.org/d3-dispatch/-/d3-dispatch-1.0.3.tgz#46e1491eaa9b58c358fce5be4e8bed626e7871f8"
 
-d3-dsv@1, d3-dsv@~1.0.7:
+d3-dsv@1:
   version "1.0.8"
   resolved "https://registry.npmjs.org/d3-dsv/-/d3-dsv-1.0.8.tgz#907e240d57b386618dc56468bacfe76bf19764ae"
   dependencies: