rewrite csv handling, fix parsing bugs, remove newlines option, improve performance

This commit is contained in:
Ben Fry
2016-07-25 21:52:08 -04:00
parent d00bff1307
commit db2bcb4f44

View File

@@ -325,9 +325,7 @@ public class Table {
protected void parse(InputStream input, String options) throws IOException {
//init();
boolean awfulCSV = false;
// boolean awfulCSV = false;
boolean header = false;
String extension = null;
boolean binary = false;
@@ -347,8 +345,9 @@ public class Table {
} else if (opt.equals("ods")) {
extension = "ods";
} else if (opt.equals("newlines")) {
awfulCSV = true;
extension = "csv";
//awfulCSV = true;
//extension = "csv";
throw new IllegalArgumentException("The 'newlines' option is no longer necessary.");
} else if (opt.equals("bin")) {
binary = true;
extension = "bin";
@@ -379,13 +378,16 @@ public class Table {
} else {
InputStreamReader isr = new InputStreamReader(input, encoding);
BufferedReader reader = new BufferedReader(isr);
if (awfulCSV) {
/*
if (awfulCSV) {
parseAwfulCSV(reader, header);
} else if ("tsv".equals(extension)) {
parseBasic(reader, header, true);
} else if ("csv".equals(extension)) {
parseBasic(reader, header, false);
}
*/
parseBasic(reader, header, "tsv".equals(extension));
}
}
@@ -404,16 +406,16 @@ public class Table {
setRowCount(row << 1);
}
if (row == 0 && header) {
setColumnTitles(tsv ? PApplet.split(line, '\t') : splitLineCSV(line));
setColumnTitles(tsv ? PApplet.split(line, '\t') : splitLineCSV(line, reader));
header = false;
} else {
setRow(row, tsv ? PApplet.split(line, '\t') : splitLineCSV(line));
setRow(row, tsv ? PApplet.split(line, '\t') : splitLineCSV(line, reader));
row++;
}
// this is problematic unless we're going to calculate rowCount first
if (row % 10000 == 0) {
/*
// this is problematic unless we're going to calculate rowCount first
if (row < rowCount) {
int pct = (100 * row) / rowCount;
if (pct != prev) { // also prevents "0%" from showing up
@@ -445,6 +447,7 @@ public class Table {
// }
/*
protected void parseAwfulCSV(BufferedReader reader,
boolean header) throws IOException {
char[] c = new char[100];
@@ -542,8 +545,187 @@ public class Table {
setRowCount(row); // shrink to the actual size
}
}
*/
/**
 * Splits one CSV record into its fields. Supports quoted fields, escaped
 * quotes (two consecutive double quotes), and quoted fields that contain line
 * breaks, in which case additional lines are pulled from the reader.
 * An instance keeps per-record state in its fields, so it can be reused for
 * successive records but handles only one record at a time.
 */
class CommaSeparatedLine {
  /** Characters of the record being parsed (may span several input lines). */
  char[] c;
  /** Fields found so far; sized from a pre-scan of the record. */
  String[] pieces;
  /** Number of entries of {@link #pieces} that have been filled in. */
  int pieceCount;
  /** Index in {@link #c} at which the next field begins. */
  int start;

  /**
   * Parses a record that begins with the given line.
   *
   * @param line first (and usually only) line of the record
   * @param reader source of further lines, read only when a quoted field
   *               is still open at the end of the text gathered so far
   * @return one entry per column; empty columns are "" rather than null
   * @throws IOException if the reader fails, or runs out of lines while a
   *                     quoted field is still open
   * @throws RuntimeException if a quote appears where the format forbids it
   */
  String[] handle(String line, BufferedReader reader) throws IOException {
    String record = line;
    while (true) {
      String[] parsed = parse(record);
      if (parsed != null) {
        return parsed;
      }
      // found a newline inside the quote, grab another line
      String nextLine = reader.readLine();
      if (nextLine == null) {
        throw new IOException("Found a quoted line that wasn't terminated properly.");
      }
      // Rebuild from the untouched record text rather than from c, because
      // addPiece() compacts escaped quotes in place and so alters c.
      // Looping (not recursing) keeps the stack flat for long multi-line fields.
      // NOTE: we're converting to \n here, which isn't perfect
      record = record + '\n' + nextLine;
    }
  }

  /**
   * Attempts to parse the given text as one complete record.
   *
   * @return the fields, or null if the text ends inside an open quoted field
   */
  private String[] parse(String record) {
    start = 0;
    pieceCount = 0;
    c = record.toCharArray();

    // get tally of number of columns and allocate the array
    int cols = 1;  // the first comma indicates the second column
    boolean quote = false;
    for (int i = 0; i < c.length; i++) {
      if (!quote && (c[i] == ',')) {
        cols++;
      } else if (c[i] == '\"') {
        // double double quotes (escaped quotes like "") will simply toggle
        // this back and forth, so it should remain accurate
        quote = !quote;
      }
    }
    pieces = new String[cols];

    while (start < c.length) {
      if (!ingest()) {
        return null;  // still inside a quoted field; caller must add a line
      }
    }

    // Make any remaining entries blanks instead of nulls. Empty columns from
    // CSV are always "" not null, so this handles successive commas in a line
    for (int i = pieceCount; i < pieces.length; i++) {
      pieces[i] = "";
    }
    return pieces;
  }

  /**
   * Stores the characters in [start, stop) as the next field.
   *
   * @param start index of the first character of the field
   * @param stop index just past the last character of the field
   * @param quotes true if the range contains escaped quote pairs; each pair
   *               is collapsed to a single quote, rewriting c in place
   */
  protected void addPiece(int start, int stop, boolean quotes) {
    if (quotes) {
      int dest = start;
      for (int i = start; i < stop; i++) {
        if (c[i] == '\"') {
          ++i;  // step over the quote
        }
        if (i != dest) {
          c[dest] = c[i];
        }
        dest++;
      }
      pieces[pieceCount++] = new String(c, start, dest - start);

    } else {
      pieces[pieceCount++] = new String(c, start, stop - start);
    }
  }

  /**
   * Reads the field that begins at {@link #start}, appends it to
   * {@link #pieces}, and moves {@link #start} to the beginning of the next
   * field (or to c.length when the record is exhausted).
   * Must only be called while start is less than c.length.
   *
   * @return true if a field was consumed, false if the text ended inside a
   *         quoted field and another line has to be appended first
   * @throws RuntimeException if a quote is found that is neither part of an
   *                          escaped pair nor a valid closing quote
   */
  protected boolean ingest() {
    boolean hasEscapedQuotes = false;
    boolean quoted = c[start] == '\"';
    if (quoted) {
      start++;  // step over the quote
    }
    int i = start;
    while (i < c.length) {
      if (c[i] == '\"') {
        // if this fella started with a quote
        if (quoted) {
          if (i == c.length-1) {
            // closing quote for field; last field on the line
            addPiece(start, i, hasEscapedQuotes);
            start = c.length;
            return true;

          } else if (c[i+1] == '\"') {
            // an escaped quote inside a quoted field, step over it
            hasEscapedQuotes = true;
            i += 2;

          } else if (c[i+1] == ',') {
            // that was our closing quote, get outta here
            addPiece(start, i, hasEscapedQuotes);
            start = i+2;
            return true;

          } else {
            // A quote followed by anything else is malformed. Without this
            // branch nothing advances i and the loop never terminates.
            throw new RuntimeException("Found a quote inside a quoted field " +
                                       "that was neither escaped nor followed by a comma");
          }

        } else {  // not a quoted line
          if (i == c.length-1) {
            // we're at the end of the line, can't have an unescaped quote
            throw new RuntimeException("Unterminated quote at end of line");

          } else if (c[i+1] == '\"') {
            // step over this crummy quote escape
            hasEscapedQuotes = true;
            i += 2;

          } else {
            throw new RuntimeException("Unterminated quoted field mid-line");
          }
        }

      } else if (!quoted && c[i] == ',') {
        addPiece(start, i, hasEscapedQuotes);
        start = i+1;
        return true;

      } else if (!quoted && i == c.length-1) {
        addPiece(start, c.length, hasEscapedQuotes);
        start = c.length;
        return true;

      } else {  // nothing all that interesting
        i++;
      }
    }

    // if still inside a quote, indicate that another line should be read
    if (quoted) {
      return false;
    }

    // Only reachable when an unquoted field ends in an escaped quote pair
    // (the i += 2 above stepped past the end); the field runs to the end.
    addPiece(start, c.length, hasEscapedQuotes);
    start = c.length;
    return true;
  }
}
// CSV line parser; created by splitLineCSV() the first time it is needed
// (while this is still null) and reused for every later call.
CommaSeparatedLine csl;
/**
* Parse a line of text as comma-separated values, returning each value as
* one entry in an array of String objects. Remove quotes from entries that
@@ -551,23 +733,53 @@ public class Table {
* @param line line of text to be parsed
* @return an array of the individual values formerly separated by commas
*/
static protected String[] splitLineCSV(String line) {
protected String[] splitLineCSV(String line, BufferedReader reader) throws IOException {
  // Create the shared parser on first use, then keep reusing it.
  CommaSeparatedLine parser = csl;
  if (parser == null) {
    parser = new CommaSeparatedLine();
    csl = parser;
  }
  // The reader is handed along so the parser can fetch more lines itself.
  return parser.handle(line, reader);
}
/*
static protected String[] splitLineCSV(String line, BufferedReader reader) throws IOException {
char[] c = line.toCharArray();
int rough = 1; // at least one
// get tally of number of columns and allocate the array
int cols = 1; // the first comma indicates the second column
boolean quote = false;
for (int i = 0; i < c.length; i++) {
if (!quote && (c[i] == ',')) {
rough++;
cols++;
} else if (c[i] == '\"') {
// double double quotes (escaped quotes like "") will simply toggle
// this back and forth, so it should remain accurate
quote = !quote;
}
}
String[] pieces = new String[rough];
String[] pieces = new String[cols];
// now do actual parsing
int pieceCount = 0;
int offset = 0;
while (offset < c.length) {
int start = offset;
int stop = nextComma(c, offset);
while (stop == -1) {
// found a newline inside the quote, grab another line
String nextLine = reader.readLine();
System.out.println("extending to " + nextLine);
if (nextLine == null) {
System.err.println(line);
throw new IOException("Found a quoted line that wasn't terminated properly.");
}
char[] temp = new char[c.length + 1 + nextLine.length()];
PApplet.arrayCopy(c, temp, c.length);
// NOTE: we're converting to \n here, which isn't perfect
temp[c.length] = '\n';
line.getChars(0, nextLine.length(), temp, c.length + 1);
c = temp;
stop = nextComma(c, offset);
System.out.println("stop is now " + stop);
}
offset = stop + 1; // next time around, need to step over the comment
if (c[start] == '\"' && c[stop-1] == '\"') {
start++;
@@ -588,26 +800,80 @@ public class Table {
String s = new String(c, start, ii - start);
pieces[pieceCount++] = s;
}
// make any remaining entries blanks instead of nulls
// Make any remaining entries blanks instead of nulls. Empty columns from
// CSV are always "" not null, so this handles successive commas in a line
for (int i = pieceCount; i < pieces.length; i++) {
pieces[i] = "";
}
return pieces;
}
*/
/**
* Returns the next comma (not inside a quote) in the specified array.
* @param c array to search
* @param index offset at which to start looking
* @return index of the comma, or -1 if line ended inside an unclosed quote
*/
/*
static protected int nextComma(char[] c, int index) {
boolean quote = false;
if (index == c.length) { // we're already at the end
return c.length;
}
boolean quoted = c[index] == '\"';
if (quoted) {
index++; // step over the quote
}
for (int i = index; i < c.length; i++) {
if (c[i] == '\"') {
// if this fella started with a quote
if (quoted) {
if (i == c.length-1) {
//return -1; // ran out of chars
// closing quote for field; last field on the line
return c.length;
} else if (c[i+1] == '\"') {
// an escaped quote inside a quoted field, step over it
i++;
} else if (c[i+1] == ',') {
// that's our closing quote, get outta here
return i+1;
}
} else { // not a quoted line
if (i == c.length-1) {
// we're at the end of the line, can't have an unescaped quote
//return -1; // ran out of chars
throw new RuntimeException("Unterminated quoted field at end of line");
} else if (c[i+1] == '\"') {
// step over this crummy quote escape
++i;
} else {
throw new RuntimeException("Unterminated quoted field mid-line");
}
}
} else if (!quoted && c[i] == ',') {
return i;
}
if (!quote && (c[i] == ',')) {
// found a comma, return this location
return i;
} else if (c[i] == '\"') {
// if it's a quote, then either the next char is another quote,
// or if this is a quoted entry, it better be a comma
quote = !quote;
}
}
// if still inside a quote, indicate that another line should be read
if (quote) {
return -1;
}
// made it to the end of the array with no new comma
return c.length;
}
*/
/**
@@ -4429,7 +4695,7 @@ public class Table {
int prev = -1;
int row = 0;
while ((line = reader.readLine()) != null) {
convertRow(output, tsv ? PApplet.split(line, '\t') : splitLineCSV(line));
convertRow(output, tsv ? PApplet.split(line, '\t') : splitLineCSV(line, reader));
row++;
if (row % 10000 == 0) {