The bedroom window was a very seedy and disreputable hard-felt hat.

Jeremy Brett as Sherlock Holmes

A Markov chain is a type of statistical model that's used to describe things that happen sequentially. You begin in one state, and from there you have a certain probability of moving to each of the possible next states. Models like this are useful for things like finding conserved DNA sequences.

This can also be quite a fun thing to play with - it's great for taking in text and trying to make sensible-sounding sentences out of it. The idea being that instead of understanding what the sentences actually mean, you can just see what word usually comes after the word you started with and pick one of them to go next.

So I decided to model the English language as a Markov chain, using the text from The Adventures of Sherlock Holmes (from Project Gutenberg) as training data, and produced about as much coherence as you'd expect from such a method. If you go to http://bethmcmillan.com/geek/markov/, you can generate your very own pseudo-sentence.

I also made a Twitter bot that tweets these nonsense Holmesian sentences.

In brief, I installed node.js, which lets you run JavaScript without a browser, and added the "twit", "jsdom" and "jquery" modules. I followed this tutorial for making a Twitter bot. The bot tweets every 5 minutes (I might change this if it turns out to be too much). After stripping the newlines, quotation marks and double spaces from the text, it picks a random word to begin with. Then, it takes this random word and the one that follows it, and finds all of the other places in the text where this pair of words can be found. Next, at random, it picks one of these locations and takes the next word in the sentence. Finally, the process repeats with the two newest words until there's a tweet-length phrase.

All my code's available under the fold, for anyone who's interested. Feel free to follow @markov_holmes for entertaining gibberish!


'use strict';

// Stop adding words once the phrase is at least this many characters long.
var TARGET_LENGTH = 130;
// Upper bound on random tries / appended words, so no loop can run forever.
var MAX_ATTEMPTS = 200;
// Time between tweets: 300000 ms = 5 minutes.
var TWEET_INTERVAL_MS = 300000;

/**
 * Split raw text into words.
 *
 * Double-quote characters are removed, then the text is split on any run of
 * whitespace (spaces, tabs, CR, LF). Empty tokens are dropped so every
 * returned word has at least one character.
 *
 * @param {string} text - Raw corpus text.
 * @returns {string[]} The words, in order of appearance.
 */
function tokenize(text) {
  return text
    .replace(/"/g, '')
    .split(/\s+/)
    .filter(function (word) { return word.length > 0; });
}

/**
 * Pick a random index whose word starts with an upper-case letter and which
 * is followed by at least one more word.
 *
 * @param {string[]} words - Non-empty words (see tokenize).
 * @param {function(): number} random - Returns a number in [0, 1).
 * @returns {number} A start index, or -1 when there are fewer than two
 *   words. If no capitalised word turns up within MAX_ATTEMPTS tries, the
 *   last index tried is returned anyway.
 */
function pickStartIndex(words, random) {
  // The start word needs a successor, so the final word is excluded.
  var candidates = words.length - 1;
  if (candidates < 1) {
    return -1;
  }
  var index = 0;
  for (var attempt = 0; attempt < MAX_ATTEMPTS; attempt++) {
    index = Math.floor(random() * candidates);
    var first = words[index].charAt(0);
    // A real upper-case letter differs from its lower-case form; digits and
    // punctuation do not, so they are rejected here.
    if (first === first.toUpperCase() && first !== first.toLowerCase()) {
      return index;
    }
  }
  return index;
}

/**
 * Find every position in the text that directly follows the given word pair.
 *
 * @param {string[]} words - The corpus words.
 * @param {string} first - First word of the pair.
 * @param {string} second - Second word of the pair.
 * @returns {number[]} Indices of the words that come right after each
 *   occurrence of the pair. A pair at the very end of the text has no
 *   follower and contributes nothing.
 */
function findFollowers(words, first, second) {
  var followers = [];
  // Stop two short of the end so that i + 2 is always a valid index.
  for (var i = 0; i + 2 < words.length; i++) {
    if (words[i] === first && words[i + 1] === second) {
      followers.push(i + 2);
    }
  }
  return followers;
}

/**
 * Make sure a phrase ends with sentence punctuation.
 *
 * @param {string} text - The phrase.
 * @returns {string} The phrase unchanged if it already ends in '.', '?' or
 *   '!'; otherwise the phrase with '.' appended.
 */
function endWithPunctuation(text) {
  var last = text.slice(-1);
  if (last === '.' || last === '?' || last === '!') {
    return text;
  }
  return text + '.';
}

/**
 * Build a pseudo-sentence from the corpus.
 *
 * Starts from two consecutive words, then repeatedly looks up every place the
 * two newest words occur together and appends the word that follows one of
 * those places, chosen at random.
 *
 * @param {string[]} words - The corpus words (see tokenize).
 * @param {function(): number} [random=Math.random] - Random source in [0, 1).
 * @returns {string} The generated phrase, or '' when the corpus has fewer
 *   than two words.
 */
function generatePhrase(words, random) {
  random = random || Math.random;

  var start = pickStartIndex(words, random);
  if (start < 0) {
    return '';
  }

  var answer = [words[start], words[start + 1]];
  // Character count of the phrase so far, including the joining space.
  var length = answer[0].length + 1 + answer[1].length;

  for (var added = 0; length < TARGET_LENGTH && added < MAX_ATTEMPTS; added++) {
    var followers = findFollowers(
      words,
      answer[answer.length - 2],
      answer[answer.length - 1]
    );
    if (followers.length === 0) {
      // Dead end: this pair only occurs at the very end of the text.
      break;
    }
    var nextWord = words[followers[Math.floor(random() * followers.length)]];
    answer.push(nextWord);
    length += nextWord.length + 1;
  }

  return endWithPunctuation(answer.join(' '));
}

// Every TWEET_INTERVAL_MS: read the corpus, generate a phrase and tweet it.
// The timer handle is kept so the interval can be cancelled if needed.
var tweetTimer = setInterval(function () {
  var Twit = require('twit');
  var env = require('jsdom').env;

  var T = new Twit({
      consumer_key: REDACTED
    , consumer_secret: REDACTED
    , access_token: REDACTED
    , access_token_secret: REDACTED
  });

  // read in file
  env("pg1661.txt", function (errors, window) {
    if (errors) {
      // Without a usable window there is nothing to tweet this round.
      console.log(errors);
      return;
    }
    var $ = require('jquery')(window);

    // get the text out of the document, then release the jsdom window
    var data = $("body").text();
    window.close();

    var answer = generatePhrase(tokenize(data));
    if (!answer) {
      console.log('Not enough text to build a phrase; skipping this tweet.');
      return;
    }
    console.log(answer);

    T.post('statuses/update', { status: answer }, function (err) {
      if (err) {
        // Log and carry on; the next interval will try again.
        console.log(err);
      }
    });
  });
}, TWEET_INTERVAL_MS);

...

Leave a Reply

Your email address will not be published. Required fields are marked *

*

*

This site uses Akismet to reduce spam. Learn how your comment data is processed.