Node.js 스트림을 활용한 대용량 CSV 파일 처리

문제 상황

재택근무로 전환된 후 첫 주, 데이터팀에서 100GB가 넘는 CSV 파일을 파싱해서 DB에 넣어달라는 요청이 들어왔다. 기존 코드는 fs.readFileSync로 전체 파일을 메모리에 올린 후 처리하는 방식이었고, 당연히 메모리 부족으로 프로세스가 죽었다.

스트림 기반 처리

Node.js의 Stream API로 파일을 청크 단위로 읽으면서 readline으로 한 줄씩 파싱하고, 1000행씩 배치로 묶어 DB에 넣도록 변경했다.

const fs = require('fs');
const readline = require('readline');
const { Pool } = require('pg');

// Module-level pg connection pool, configured from the DATABASE_URL environment
// variable. insertBatch() checks a client out of this pool for each batch.
const pool = new Pool({ connectionString: process.env.DATABASE_URL });

/**
 * Streams a CSV file line by line and writes the parsed rows to the database
 * in fixed-size batches, so the whole file is never held in memory at once.
 *
 * Every line produced by readline is handed to `parseCsvLine` as-is
 * (NOTE(review): a header row or blank lines are not filtered here — confirm
 * that `parseCsvLine` / the input files account for that).
 *
 * @param {string} filePath - Path of the CSV file to import.
 * @returns {Promise<void>} Resolves once every batch has been inserted.
 * @throws Rethrows any error raised while reading, parsing or inserting.
 */
async function processCsvStream(filePath) {
  const BATCH_SIZE = 1000;

  const fileStream = fs.createReadStream(filePath);
  const rl = readline.createInterface({
    input: fileStream,
    // Treat "\r\n" as a single line break regardless of chunk timing.
    crlfDelay: Infinity
  });

  let batch = [];

  try {
    for await (const line of rl) {
      batch.push(parseCsvLine(line));

      if (batch.length >= BATCH_SIZE) {
        // Awaiting here keeps at most one batch in flight at a time.
        await insertBatch(batch);
        batch = [];
      }
    }

    // Flush the final, partially filled batch.
    if (batch.length > 0) {
      await insertBatch(batch);
    }
  } finally {
    // If parsing or inserting throws mid-file, the read stream would otherwise
    // stay open (autoClose only fires on end/error of the stream itself).
    // Both calls are no-ops when the file was already read to the end.
    rl.close();
    fileStream.destroy();
  }
}

/**
 * Inserts one batch of parsed rows into the `data` table inside a single
 * transaction, so a batch is either stored completely or not at all.
 *
 * @param {Array<{col1: *, col2: *}>} rows - Rows to insert; only `col1` and
 *   `col2` are read from each row.
 * @returns {Promise<void>} Resolves once the transaction has been committed
 *   (or immediately when `rows` is empty).
 * @throws Rethrows the error that made the batch fail, after attempting a
 *   ROLLBACK. A failure of the ROLLBACK itself never replaces that error.
 */
async function insertBatch(rows) {
  // Nothing to write: do not check out a connection for an empty transaction.
  if (rows.length === 0) {
    return;
  }

  const client = await pool.connect();
  let rollbackFailed = false;
  try {
    await client.query('BEGIN');
    for (const row of rows) {
      await client.query(
        'INSERT INTO data(col1, col2) VALUES($1, $2)',
        [row.col1, row.col2]
      );
    }
    await client.query('COMMIT');
  } catch (e) {
    try {
      await client.query('ROLLBACK');
    } catch (rollbackError) {
      // Keep the original failure as the error the caller sees; a failed
      // ROLLBACK usually just means the connection is already unusable.
      rollbackFailed = true;
      console.error('ROLLBACK failed:', rollbackError);
    }
    throw e;
  } finally {
    if (rollbackFailed) {
      // A truthy argument tells the pool to discard this client instead of
      // handing a connection in an unknown state to the next caller.
      client.release(true);
    } else {
      client.release();
    }
  }
}

결과

메모리 사용량: 8GB → 200MB 이하로 감소
처리 시간: 약 4시간 소요 (100GB 기준)
안정성: 중간에 실패해도 재시작 가능하도록 offset 저장 로직 추가 (위 예제 코드에서는 생략)

배치 사이즈를 1000으로 설정한 건 실험 결과였다. 100은 너무 느렸고, 10000은 트랜잭션 타임아웃이 발생했다.

추가 개선 포인트

더 빠른 처리가 필요하다면 PostgreSQL의 COPY 명령어나, 여러 행을 하나의 INSERT 문으로 묶는 multi-row INSERT(bulk insert)를 고려할 수 있다. 하지만 지금은 이 정도면 충분했다.