// parttimejob/node_modules/needle/test/decoder_spec.js

var should  = require('should'),
    needle  = require('./../'),
    decoder = require('./../lib/decoder'),
    Q       = require('q'),
    chardet = require('jschardet'),
    fs      = require('fs'),
    http    = require('http'),
    helpers = require('./helpers');

describe('character encoding', function() {

  this.timeout(5000);

  /**
   * Builds (but does not start) an HTTP server that answers every request
   * with the contents of a local file.
   *
   * Once the incoming request has ended, `file` is read from disk and sent
   * back with status 200 and the given Content-Type. If the file cannot be
   * read, the reply is a 404 whose body is the JSON-serialized error.
   *
   * @param {string} file - path of the file to serve, handed to fs.readFile
   * @param {string} content_type - value of the Content-Type response header
   * @returns {http.Server} the unstarted server; callers listen() and close() it
   */
  function staticServerFor(file, content_type) {
    return http.createServer(function(req, res) {
      // No-op consumer: the request body is not used, only its 'end' event.
      req.on('data', function(chunk) {})
      req.on('end', function() {
        // We used to pull from a particular site that is no longer up.
        // This is a local mirror pulled from archive.org
        // https://web.archive.org/web/20181003202907/http://www.nina.jp/server/slackware/webapp/tomcat_charset.html
        fs.readFile(file, function(err, data) {
          if (err) {
            res.writeHead(404);
            res.end(JSON.stringify(err));
            return;
          }
          // Use the documented writeHead(), matching the 404 branch above,
          // rather than the deprecated writeHeader() alias.
          res.writeHead(200, { 'Content-Type': content_type })
          res.end(data);
        });
      })
    })
  }

  // Serves a local fixture with "charset=EUC-JP" in the Content-Type header and
  // checks that the Japanese text is only found in the body when `decode` is on.
  // NOTE(review): `.should.be.a.String` is accessed, not called; depending on
  // the should.js version in use it may assert nothing - confirm.
  describe('Given content-type: "text/html; charset=EUC-JP"', function() {
    // `url` is declared here so it is not created as an implicit global.
    var server, url, port = 2233;

    before(function(done) {
      url = 'http://localhost:' + port;
      server = staticServerFor('test/files/tomcat_charset.html', 'text/html; charset=EUC-JP')
      server.listen(port, done)
    })

    after(function(done) {
      server.close(done)
    })

    describe('with decode = false', function() {
      it('does not decode', function(done) {
        needle.get(url, { decode: false }, function(err, resp) {
          // Report request failures through mocha instead of crashing on `resp`.
          if (err) return done(err);
          resp.body.should.be.a.String;
          chardet.detect(resp.body).encoding.should.eql('windows-1252');
          resp.body.indexOf('EUCを使う').should.eql(-1);
          done();
        })
      })
    })

    describe('with decode = true', function() {
      it('decodes', function(done) {
        needle.get(url, { decode: true }, function(err, resp) {
          // Report request failures through mocha instead of crashing on `resp`.
          if (err) return done(err);
          resp.body.should.be.a.String;
          chardet.detect(resp.body).encoding.should.eql('ascii');
          resp.body.indexOf('EUCを使う').should.not.eql(-1);
          done();
        })
      })
    })
  })

  // Fetches the same page twice, once with decode on and once with it off, and
  // compares what each body looks like.
  // NOTE: this test requests an external host (www.chinesetop100.com), so it
  // needs network access and depends on that site's current content.
  describe('Given content-type: "text/html but file is charset: gb2312', function() {

    it('encodes to UTF-8', function(done) {

      // Our Needle wrapper that requests a chinese website.
      var task    = Q.nbind(needle.get, needle, 'http://www.chinesetop100.com/');

      // Different instantiations of this task
      var tasks   = [Q.fcall(task, {decode: true}),
                     Q.fcall(task, {decode: false})];

      // Each promise resolves with an array whose first entry carries `.body`
      // (presumably needle's response object - confirm against needle.get).
      var results = tasks.map(function(pending) {
        return pending.then(function(obj) {
          return obj[0].body;
        });
      });

      // Execute all requests concurrently
      Q.all(results).then(function(bodies) {

        var charsets = [
          chardet.detect(bodies[0]).encoding,
          chardet.detect(bodies[1]).encoding,
        ]

        // We wanted to decode our first stream as specified by options
        charsets[0].should.equal('ascii');
        bodies[0].indexOf('全球中文网站前二十强').should.not.equal(-1);

        // But not our second stream
        charsets[1].should.equal('windows-1252');
        bodies[1].indexOf('全球中文网站前二十强').should.equal(-1);

      // Route request errors and assertion failures to mocha via `done`
      // instead of letting them be rethrown as uncaught exceptions.
      }).done(function() { done(); }, done);
    })
  })

  // Serves a local fixture labelled "charset=maccentraleurope" and checks that
  // requesting it with and without `decode` yields a body detected as ascii,
  // and that issuing the decoding request does not throw synchronously.
  // NOTE(review): `.should.be.a.String` is accessed, not called; depending on
  // the should.js version in use it may assert nothing - confirm.
  describe('Given content-type: text/html; charset=maccentraleurope', function() {
    // `url` is declared here so it is not created as an implicit global.
    var server, url, port = 2233;

    // from 'https://wayback.archive-it.org/3259/20160921140616/https://www.arc.gov/research/MapsofAppalachia.asp?MAP_ID=11';
    before(function(done) {
      url = 'http://localhost:' + port;
      server = staticServerFor('test/files/Appalachia.html', 'text/html; charset=maccentraleurope')
      server.listen(port, done)
    })

    after(function(done) {
      server.close(done)
    })

    describe('with decode = false', function() {
      it('does not decode', function(done) {
        needle.get(url, { decode: false }, function(err, resp) {
          // Report request failures through mocha instead of crashing on `resp`.
          if (err) return done(err);
          resp.body.should.be.a.String;
          chardet.detect(resp.body).encoding.should.eql('ascii');
          done();
        })
      })
    })

    describe('with decode = true', function() {
      it('does not explode', function(done) {
        // The wrapper only guards against a synchronous throw from needle.get;
        // the assertions inside the callback run later.
        (function() {
          needle.get(url, { decode: true }, function(err, resp) {
            // Report request failures through mocha instead of crashing on `resp`.
            if (err) return done(err);
            resp.body.should.be.a.String;
            chardet.detect(resp.body).encoding.should.eql('ascii');
            done();
          })
        }).should.not.throw();
      })
    })
  })

  // Serves a short accented string with a charset-less "text/html" Content-Type
  // and checks that the body comes back equal to the string that was served.
  // NOTE(review): `.should.be.a.String` is accessed, not called; depending on
  // the should.js version in use it may assert nothing - confirm.
  describe('Given content-type: "text/html"', function () {

    var server,
        port = 54321,
        text = 'Magyarországi Fióktelepe'

    before(function(done) {
      server = helpers.server({
        port: port,
        response: text,
        headers: { 'Content-Type': 'text/html' }
      }, done);
    })

    after(function(done) {
      server.close(done)
    })

    describe('with decode = false', function () {
      it('decodes by default to utf-8', function (done) {

        needle.get('http://localhost:' + port, { decode: false }, function (err, resp) {
          // Report request failures through mocha instead of crashing on `resp`.
          if (err) return done(err);
          resp.body.should.be.a.String;
          chardet.detect(resp.body).encoding.should.eql('ISO-8859-2');
          // Compare against the very string the server was configured to send.
          resp.body.should.eql(text)
          done();
        })

      })

    })
  })
  
  // Writes the bytes of a single multibyte character ('慶') to the decoder one
  // byte per write, and expects the collected output, read as UTF-8, to be
  // that character again.
  describe('multibyte characters split across chunks', function () {

    // One entry per encoding under test; `bytes` is '慶' in that encoding.
    var samples = [
      { encoding: 'utf-8',   bytes: [0xE6, 0x85, 0xB6] },
      { encoding: 'euc-jp',  bytes: [0xB7, 0xC4] },
      { encoding: 'gb18030', bytes: [0x91, 0x63] }
    ];

    samples.forEach(function (sample) {

      describe('with encoding = ' + sample.encoding, function() {

        var stream,
            collected = Buffer.alloc(0);

        before(function() {
          stream = decoder(sample.encoding);
        });

        it('reassembles split multibyte characters', function (done) {

          // Accumulate whatever the decoder emits.
          stream.on("data", function(piece) {
            collected = Buffer.concat([ collected, piece ]);
          });

          // Once the decoder is finished, the output must be the whole character.
          stream.on("end", function() {
            collected.toString("utf-8").should.eql('慶');
            done();
          });

          // write '慶' split across chunks, one byte per write
          sample.bytes.forEach(function (byte) {
            stream.write(Buffer.from([byte]));
          });
          stream.end();

        })
      })
    })

  })
  
})
init 4 weeks ago			`var should = require('should'),`
			`needle = require('./../'),`
			`decoder = require('./../lib/decoder'),`
			`Q = require('q'),`
			`chardet = require('jschardet'),`
			`fs = require('fs'),`
			`http = require('http'),`
			`helpers = require('./helpers');`

			`describe('character encoding', function() {`

			`this.timeout(5000);`

			`function staticServerFor(file, content_type) {`
			`return http.createServer(function(req, res) {`
			`req.on('data', function(chunk) {})`
			`req.on('end', function() {`
			`// We used to pull from a particular site that is no longer up.`
			`// This is a local mirror pulled from archive.org`
			`// https://web.archive.org/web/20181003202907/http://www.nina.jp/server/slackware/webapp/tomcat_charset.html`
			`fs.readFile(file, function(err, data) {`
			`if (err) {`
			`res.writeHead(404);`
			`res.end(JSON.stringify(err));`
			`return;`
			`}`
			`res.writeHeader(200, { 'Content-Type': content_type })`
			`res.end(data);`
			`});`
			`})`
			`})`
			`}`

			`describe('Given content-type: "text/html; charset=EUC-JP"', function() {`
			`var server, port = 2233;`

			`before(function(done) {`
			`server = staticServerFor('test/files/tomcat_charset.html', 'text/html; charset=EUC-JP')`
			`server.listen(port, done)`
			`url = 'http://localhost:' + port;`
			`})`

			`after(function(done) {`
			`server.close(done)`
			`})`

			`describe('with decode = false', function() {`
			`it('does not decode', function(done) {`
			`needle.get(url, { decode: false }, function(err, resp) {`
			`resp.body.should.be.a.String;`
			`chardet.detect(resp.body).encoding.should.eql('windows-1252');`
			`resp.body.indexOf('EUCを使う').should.eql(-1);`
			`done();`
			`})`
			`})`
			`})`

			`describe('with decode = true', function() {`
			`it('decodes', function(done) {`
			`needle.get(url, { decode: true }, function(err, resp) {`
			`resp.body.should.be.a.String;`
			`chardet.detect(resp.body).encoding.should.eql('ascii');`
			`resp.body.indexOf('EUCを使う').should.not.eql(-1);`
			`done();`
			`})`
			`})`
			`})`
			`})`

			`describe('Given content-type: "text/html but file is charset: gb2312', function() {`

			`it('encodes to UTF-8', function(done) {`

			`// Our Needle wrapper that requests a chinese website.`
			`var task = Q.nbind(needle.get, needle, 'http://www.chinesetop100.com/');`

			`// Different instantiations of this task`
			`var tasks = [Q.fcall(task, {decode: true}),`
			`Q.fcall(task, {decode: false})];`

			`var results = tasks.map(function(task) {`
			`return task.then(function(obj) {`
			`return obj[0].body;`
			`});`
			`});`

			`// Execute all requests concurrently`
			`Q.all(results).done(function(bodies) {`

			`var charsets = [`
			`chardet.detect(bodies[0]).encoding,`
			`chardet.detect(bodies[1]).encoding,`
			`]`

			`// We wanted to decode our first stream as specified by options`
			`charsets[0].should.equal('ascii');`
			`bodies[0].indexOf('全球中文网站前二十强').should.not.equal(-1);`

			`// But not our second stream`
			`charsets[1].should.equal('windows-1252');`
			`bodies[1].indexOf('全球中文网站前二十强').should.equal(-1);`

			`done();`
			`});`
			`})`
			`})`

			`describe('Given content-type: text/html; charset=maccentraleurope', function() {`
			`var server, port = 2233;`

			`// from 'https://wayback.archive-it.org/3259/20160921140616/https://www.arc.gov/research/MapsofAppalachia.asp?MAP_ID=11';`
			`before(function(done) {`
			`server = staticServerFor('test/files/Appalachia.html', 'text/html; charset=maccentraleurope')`
			`server.listen(port, done)`
			`url = 'http://localhost:' + port;`
			`})`

			`after(function(done) {`
			`server.close(done)`
			`})`

			`describe('with decode = false', function() {`
			`it('does not decode', function(done) {`
			`needle.get(url, { decode: false }, function(err, resp) {`
			`resp.body.should.be.a.String;`
			`chardet.detect(resp.body).encoding.should.eql('ascii');`
			`done();`
			`})`
			`})`
			`})`

			`describe('with decode = true', function() {`
			`it('does not explode', function(done) {`
			`(function() {`
			`needle.get(url, { decode: true }, function(err, resp) {`
			`resp.body.should.be.a.String;`
			`chardet.detect(resp.body).encoding.should.eql('ascii');`
			`done();`
			`})`
			`}).should.not.throw();`
			`})`
			`})`
			`})`

			`describe('Given content-type: "text/html"', function () {`

			`var server,`
			`port = 54321,`
			`text = 'Magyarországi Fióktelepe'`

			`before(function(done) {`
			`server = helpers.server({`
			`port: port,`
			`response: text,`
			`headers: { 'Content-Type': 'text/html' }`
			`}, done);`
			`})`

			`after(function(done) {`
			`server.close(done)`
			`})`

			`describe('with decode = false', function () {`
			`it('decodes by default to utf-8', function (done) {`

			`needle.get('http://localhost:' + port, { decode: false }, function (err, resp) {`
			`resp.body.should.be.a.String;`
			`chardet.detect(resp.body).encoding.should.eql('ISO-8859-2');`
			`resp.body.should.eql('Magyarországi Fióktelepe')`
			`done();`
			`})`

			`})`

			`})`
			`})`

			`describe('multibyte characters split across chunks', function () {`

			`describe('with encoding = utf-8', function() {`

			`var d,`
			`result = Buffer.allocUnsafe(0);`

			`before(function(done) {`
			`d = decoder('utf-8');`
			`done();`
			`});`

			`it('reassembles split multibyte characters', function (done) {`

			`d.on("data", function(chunk){`
			`result = Buffer.concat([ result, chunk ]);`
			`});`

			`d.on("end", function(){`
			`result.toString("utf-8").should.eql('慶');`
			`done();`
			`});`

			`// write '慶' in utf-8 split across chunks`
			`d.write(Buffer.from([0xE6]));`
			`d.write(Buffer.from([0x85]));`
			`d.write(Buffer.from([0xB6]));`
			`d.end();`

			`})`
			`})`

			`describe('with encoding = euc-jp', function() {`

			`var d,`
			`result = Buffer.allocUnsafe(0);`

			`before(function(done) {`
			`d = decoder('euc-jp');`
			`done();`
			`});`

			`it('reassembles split multibyte characters', function (done) {`

			`d.on("data", function(chunk){`
			`result = Buffer.concat([ result, chunk ]);`
			`});`

			`d.on("end", function(){`
			`result.toString("utf-8").should.eql('慶');`
			`done();`
			`});`

			`// write '慶' in euc-jp split across chunks`
			`d.write(Buffer.from([0xB7]));`
			`d.write(Buffer.from([0xC4]));`
			`d.end();`

			`})`
			`})`

			`describe('with encoding = gb18030', function() {`

			`var d,`
			`result = Buffer.allocUnsafe(0);`

			`before(function(done) {`
			`d = decoder('gb18030');`
			`done();`
			`});`

			`it('reassembles split multibyte characters', function (done) {`

			`d.on("data", function(chunk){`
			`result = Buffer.concat([ result, chunk ]);`
			`});`

			`d.on("end", function(){`
			`result.toString("utf-8").should.eql('慶');`
			`done();`
			`});`

			`// write '慶' in gb18030 split across chunks`
			`d.write(Buffer.from([0x91]));`
			`d.write(Buffer.from([0x63]));`
			`d.end();`

			`})`
			`})`

			`})`

			`})`