Skip to content

Delayed setEncoding on Readable causes multibyte characters to break #27932

Closed
@verdaster

Description

@verdaster
  • Version: v10.15.3
  • Platform: 4.15.0-50-generic #54~16.04.1-Ubuntu x86_64 GNU/Linux
  • Subsystem: stream/http

Server

const http = require('http');

const server = http.createServer(async (req, res) => {
    const data = [];
    // simulate some preprocessing - eg. authentication in middleware
    // if setEncoding is called before async operation all works correctly
    await sleep(2000);
    req.setEncoding('utf8');
    req.on('data', (d) => {
        // already buffered chunks are read as buffer, although encoding is set to utf8
        data.push(d);
        console.log(d);
    });
    req.on('end', () => {
        const body = data.join('');
        // expected {"řeřicha":"hello čača"} but got {"��eřicha":"hello čača"}
        console.log(body);
        console.log(body.toString());
        res.statusCode = 204;
        res.end();
    });
});

server.listen(3000, 'localhost', () => {
    console.log('LISTENING');
});

async function sleep(ms = 1000) {
    return new Promise((res) => setTimeout(res, ms));
}

Client

const http = require('http');

const data = Buffer.from(JSON.stringify({ řeřicha: 'hello čača' }));

async function main() {
    const req = http.request(
        {
            hostname: 'localhost',
            port: 3000,
            method: 'POST',
            path: '/',
            headers: {
                'content-type': 'application/json',
                'content-length': data.length
            }
        },
        (res) => {
            console.log(res.statusCode);
            console.log(res.headers);
        }
    );

    const offset = 3;
    // write some bytes
    req.write(data.slice(0, offset));
    for (let i = offset; i < data.length; i++) {
        // simulate very slow connection
        await sleep();
        req.write(data.slice(i, i + 1));
    }
    req.end();
}

main();

async function sleep(ms = 1000) {
    return new Promise((res) => setTimeout(res, ms));
}

When setEncoding('utf8') is used on Readable stream (eg. IncomingMessage) with some delay, although no data are read, then already buffered chunks are read as buffer which breaks multibyte characters.

If encoding is set immediately, eg. on http 'request' event, than it works properly, however http frameworks (Express, Fastify, Koa) allows including asynchronous middlewares in processing pipeline before body parsing occurs, which delays setEncoding call and thus breaks multibyte characters.

Included code simulates this delay as two first emitted 'data' chunks are buffers which in standard body processing breaks characters if chunk contains only some bytes of unicode character. In case of http body parsing middlewares this also breaks content-length check.

Example body processing middleware from Fastify v1.14.6

function rawBody (request, reply, options, parser, done) {
  var asString = parser.asString
  var limit = options.limit === null ? parser.bodyLimit : options.limit
  var contentLength = request.headers['content-length'] === undefined
    ? NaN
    : Number.parseInt(request.headers['content-length'], 10)

  if (contentLength > limit) {
    const err = new Error('Request body is too large')
    err.statusCode = 413
    reply.code(err.statusCode).send(err)
    return
  }

  var receivedLength = 0
  var body = asString === true ? '' : []
  var req = request.raw

  if (asString === true) {
    req.setEncoding('utf8')
  }

  req.on('data', onData)
  req.on('end', onEnd)
  req.on('error', onEnd)

  function onData (chunk) {
    receivedLength += chunk.length

    if (receivedLength > limit) {
      req.removeListener('data', onData)
      req.removeListener('end', onEnd)
      req.removeListener('error', onEnd)
      const err = new Error('Request body is too large')
      err.statusCode = 413
      reply.code(err.statusCode).send(err)
      return
    }

    if (asString === true) {
      // first chunks might be buffers automatically coerced to strings
      // with broken multibytes characters
      body += chunk
    } else {
      body.push(chunk)
    }
  }

  function onEnd (err) {
    req.removeListener('data', onData)
    req.removeListener('end', onEnd)
    req.removeListener('error', onEnd)

    if (err !== undefined) {
      err.statusCode = 400
      reply.code(err.statusCode).send(err)
      return
    }

    if (asString === true) {
      receivedLength = Buffer.byteLength(body)
    }
    if (!Number.isNaN(contentLength) && receivedLength !== contentLength) {
      const err = new Error('Request body size did not match Content-Length')
      err.statusCode = 400
      reply.code(err.statusCode).send(err)
      return
    }

    if (asString === false) {
      body = Buffer.concat(body)
    }

    var result = parser.fn(req, body, done)
    if (result && typeof result.then === 'function') {
      result.then(body => done(null, body), done)
    }
  }
}

Metadata

Metadata

Assignees

No one assigned

    Labels

    confirmed-bugIssues with confirmed bugs.streamIssues and PRs related to the stream subsystem.

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions