Node.js 스트림으로 대용량 CSV 파싱 최적화

문제 상황

데이터 분석팀에서 10GB 크기의 CSV 파일을 DB로 마이그레이션하는 작업을 요청받았다. 처음엔 csv-parser 라이브러리로 전체 파일을 읽어서 처리하려 했는데, 5GB쯤에서 heap out of memory 에러가 발생했다.

const fs = require('fs');
const csv = require('csv-parser');
// Anti-pattern shown for illustration: every parsed row is kept in this array.
const results = [];

const rowStream = fs.createReadStream('data.csv').pipe(csv());

rowStream.on('data', (row) => {
  // The whole file ends up resident in memory.
  results.push(row);
});

rowStream.on('end', () => {
  // By this point the heap limit has already been exceeded.
});

스트림 기반 처리로 전환

배열에 쌓지 않고 청크 단위로 DB에 바로 쓰는 방식으로 변경했다. stream.Transform을 활용해서 배치 처리 로직을 구현했다.

const fs = require('fs');
const { Transform, pipeline } = require('stream');
const csv = require('csv-parser');

/**
 * Object-mode Transform that groups incoming rows into fixed-size batches and
 * hands each full batch to a bulk-insert function.
 *
 * The stream never pushes anything downstream; it is used as the terminal
 * stage of a pipeline. Each insert is awaited before the stream accepts more
 * rows, so a slow database throttles the upstream reader (backpressure)
 * instead of letting pending inserts accumulate in memory.
 */
class BatchWriter extends Transform {
  /**
   * @param {number} [batchSize=1000] - Number of rows collected before an insert is issued.
   * @param {(rows: object[]) => (Promise<unknown>|unknown)} [insertBatch] -
   *   Function that persists one batch. Defaults to `db.batchInsert(rows)`;
   *   `db` is not defined in this block and must be provided by the
   *   surrounding module when the default is used.
   */
  constructor(batchSize = 1000, insertBatch = (rows) => db.batchInsert(rows)) {
    super({ objectMode: true });
    this.batch = [];
    this.batchSize = batchSize;
    this.insertBatch = insertBatch;
  }

  /**
   * Buffers one row and, once the batch is full, writes it out before
   * acknowledging the row.
   *
   * @param {object} chunk - One parsed row.
   * @param {string} encoding - Unused in object mode.
   * @param {(err?: Error) => void} callback - Called when the row has been handled.
   */
  _transform(chunk, encoding, callback) {
    this.batch.push(chunk);
    if (this.batch.length < this.batchSize) {
      callback();
      return;
    }
    // Acknowledge only after the insert settles; a failure becomes a stream error.
    this.writeBatch().then(() => callback(), callback);
  }

  /**
   * Writes the final, partially filled batch before the stream finishes.
   *
   * @param {(err?: Error) => void} callback - Called once the remaining rows are written.
   */
  _flush(callback) {
    if (this.batch.length === 0) {
      callback();
      return;
    }
    this.writeBatch().then(() => callback(), callback);
  }

  /**
   * Runs the bulk insert for the rows collected so far and starts a new batch.
   *
   * @returns {Promise<void>} Resolves when the insert has completed; rejects
   *   if the insert function throws or returns a rejected promise.
   */
  async writeBatch() {
    // Detach the rows first so the buffer is already empty while the insert is in flight.
    const rows = this.batch;
    this.batch = [];
    // Run the bulk insert.
    await this.insertBatch(rows);
  }
}

// pipeline() is used instead of chained .pipe() calls: it reports an error
// from any stage to the callback and destroys the other streams, so a failed
// read, parse, or insert does not leave the file handle open.
pipeline(
  fs.createReadStream('data.csv'),
  csv(),
  new BatchWriter(1000),
  (err) => {
    if (err) {
      console.error('CSV import failed:', err);
      process.exitCode = 1;
    }
  },
);

결과

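메모리 사용량이 약 8GB에서 800MB로 줄었고, 처리 시간도 30% 정도 단축됐다. backpressure 처리를 위해 highWaterMark 옵션도 조정했는데, 스트림 기본값인 16KB보다 64KB로 올렸을 때 성능이 더 좋았다. (참고: fs.createReadStream의 highWaterMark 기본값은 이미 64KB이며, objectMode 스트림에서는 highWaterMark가 바이트가 아니라 객체 개수를 뜻하고 기본값은 16이다.)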

스트림은 Node.js의 핵심이지만 평소에 잘 안 쓰게 되는데, 대용량 데이터 처리에선 필수였다. 다음엔 에러 핸들링과 재시도 로직도 추가해볼 예정이다.