Archive for category software development

Simple Scala Map/Reduce Job

I was recently tasked with writing a Hadoop map/reduce job. The job had to take a list of regular expressions and scour hundreds of gigabytes of log files for matches. Since I’ve been leaning more and more towards Scala, I wanted to use it for this job, but I also wanted to use Maven for the job’s package management to make it easy to set up and extend. Finally, I wanted unit tests for my mapper and reducer, plus an overall job unit test. The result is this project, which I posted to GitHub as a template for future projects. I hope it proves as helpful for others as I’m sure it’ll be for me.

Tags: , , , ,

Select distinct for MongoDB

Here is a handy script I’ve been using for MongoDB to retrieve a list of all the fields used in a collection. This uses a map/reduce routine and has to comb over all the documents in a collection so you may want to exercise caution when using this script.

// Print the name of every top-level field used by the documents of a collection.
//
// usage:
// mongo localhost/foo --quiet --eval="var collection='bar';" getcollectionkeys.js
//
// `collection` must be defined by the caller (see --eval above). The map/reduce
// pass visits every document, so this can be slow on large collections.

// Scratch collection that receives one document per distinct field name.
var keysCollection = collection + "_keys";

var mr = db.runCommand({
  "mapreduce": collection,
  // Emit each field name as a key; the value is irrelevant.
  "map": function() {
    for (var key in this) { emit(key, null); }
  },
  "reduce": function(key, stuff) { return null; },
  "out": keysCollection
});

// Fail with the server's reply instead of a confusing error on db[undefined].
if (!mr.ok) {
  throw new Error("mapreduce on '" + collection + "' failed: " + tojson(mr));
}

try {
  print(db[keysCollection].distinct("_id"));
} finally {
  // Always remove the scratch collection, even if printing fails.
  db[keysCollection].drop();
}

Tags: , , ,

Simple PhantomJS web scraping script

Here is a simple web scraping script I wrote for PhantomJS, the immensely useful headless browser, to load a page, inject jQuery into it, and then scrape the page using a user-supplied jQuery selector.

# PhantomJS modules: `system` supplies the command-line arguments and
# `webpage` creates the page object used to load the target URL.
system = require('system')
page = require('webpage').create()

# Load underscore into the PhantomJS (outer) script context.
phantom.injectJs("static/js/underscore-min.js")

# Relay console output from the loaded page to PhantomJS's own console,
# skipping any message that starts with "Unsafe".
page.onConsoleMessage = (msg) ->
    console.log(msg) unless msg.match(/^Unsafe/)

# Print the trimmed, non-empty innerHTML of every element matched by the
# jQuery selector `elselector`, one console line per element.
# Passed to page.evaluate, so it relies on jQuery (`$`) being available there.
scrapeEl = (elselector) ->
    for el in $(elselector)
        html = el.innerHTML
        continue unless html
        text = html.trim()
        console.log(text) if text.length > 0
    # Explicit return: otherwise CoffeeScript turns the loop into a comprehension
    # and hands back a throwaway array of console.log results.
    return

# Entry point: open the URL given as the first CLI argument, inject the helper
# libraries and jQuery into the page, then print every match of the jQuery
# selector given as the second CLI argument.
if system.args.length < 3
    # system.args[0] is the script name, so two user arguments are required.
    console.log "Usage: phantomjs #{system.args[0]} <url> <jquery-selector>"
    phantom.exit 1
else
    page.open system.args[1], (status) ->
        if status isnt 'success'
            phantom.exit 1
        else
            page.injectJs "static/js/underscore-min.js"
            page.injectJs "static/js/utils.js"
            # scrapeEl needs `$`; injectJs reports failure by returning false.
            if page.injectJs("static/js/jquery-1.8.2.min.js")
                page.evaluate scrapeEl, system.args[2]
                phantom.exit()
            else
                console.log "Failed to inject static/js/jquery-1.8.2.min.js"
                phantom.exit 1

Run it with:

phantomjs scrape_element.coffee "http://www.moviefone.com/coming-soon" ".movieTitle span"

Tags:

node.js at Facebook

Slides

Tags: , , ,

Simple init.d script template

Recently I needed to create an init.d script, and since I had a hard time finding an example elsewhere [1], here’s the overly simple script I came up with to get the job done:

#!/bin/bash
# myapp daemon
# chkconfig: 345 20 80
# description: myapp daemon
# processname: myapp

# Minimal init.d script template.
# Usage: <script> {status|start|stop|restart}
#   start   - launch the daemon in the background and record its PID
#   status  - report whether the recorded PID is still alive
#   stop    - send HUP to the recorded PID and remove the pidfile
#   restart - stop followed by start

# Directory the daemon is started from.
DAEMON_PATH="/home/wes/Development/projects/myapp"

# Command to run and the options passed to it.
DAEMON=myapp
DAEMONOPTS="-my opts"

NAME=myapp
DESC="My daemon description"
PIDFILE=/var/run/$NAME.pid
SCRIPTNAME=/etc/init.d/$NAME

case "$1" in
start)
    printf "%-50s" "Starting $NAME..."
    # Do not launch the daemon from the wrong directory.
    cd "$DAEMON_PATH" || { printf "%s\n" "Fail"; exit 1; }
    # Run in the background, discard output, and capture the PID via $!.
    # $DAEMONOPTS is intentionally unquoted so it splits into separate arguments.
    PID=$($DAEMON $DAEMONOPTS > /dev/null 2>&1 & echo $!)
    if [ -z "$PID" ]; then
        printf "%s\n" "Fail"
    else
        echo "$PID" > "$PIDFILE"
        printf "%s\n" "Ok"
    fi
;;
status)
    printf "%-50s" "Checking $NAME..."
    if [ -f "$PIDFILE" ]; then
        PID=$(cat "$PIDFILE")
        # Query this exact PID; grepping the process list for the number would
        # also match unrelated processes whose PID or arguments contain it.
        if [ -n "$PID" ] && ps -p "$PID" > /dev/null 2>&1; then
            echo "Running"
        else
            printf "%s\n" "Process dead but pidfile exists"
        fi
    else
        printf "%s\n" "Service not running"
    fi
;;
stop)
    printf "%-50s" "Stopping $NAME"
    # Only read the pidfile once we know it exists (avoids a cat error).
    if [ -f "$PIDFILE" ]; then
        PID=$(cat "$PIDFILE")
        kill -HUP "$PID"
        printf "%s\n" "Ok"
        rm -f "$PIDFILE"
    else
        printf "%s\n" "pidfile not found"
    fi
;;

restart)
    "$0" stop
    "$0" start
;;

*)
    echo "Usage: $0 {status|start|stop|restart}"
    exit 1
;;
esac

This script will work in /etc/init.d on Xubuntu 11.10 (so most Debian-based systems) and CentOS 5.5 and you can control it via chkconfig.

  1. That said, if you know of such an example I’d love to hear from you. []

Tags: , , , , ,

Fun with jsonselect

One of the strengths of CSS and jQuery is that they provide a common and powerful mechanism, known as a selector language, for referencing bits of data. This is especially useful for data whose structure is not exactly known at runtime, which makes such an addressing scheme a perfect fit for the often lumpy world of HTML.

Increasingly JSON is being used as a transport medium for data and with the rise of NoSQL solutions, having a selector language for JSON makes a lot of sense when dealing with JSON documents whose structure isn’t deterministic.

JSONSelect provides a good implementation of just such a JSON selector language but after working with it on a project I found myself needing to do more than it allowed me to do. Namely, I wanted 1. to be able to perform a selection and receive matching paths instead of the data contained in those paths and I wanted 2. to be able to modify data specified at a path location in-place.

jsonselect.match(sel, obj, asPath); // New asPath flag: return matching paths instead of the matched values
jsonselect.forEach(sel, obj, fun, asPath); // Same asPath flag, added to forEach
jsonselect.get(path,obj); // Get the value located at a path
jsonselect.set(path, value, obj); // Set the value located at a path
jsonselect.del(path,root); // Delete the entry located at a path

Here is my modified version of jsonselect in case anyone needs help solving the same problems I mentioned above.

Tags: , , , ,

Finding yesterday’s beginning and ending unix timestamp

When writing reports, I’ve often needed the Unix timestamps for the beginning and end of a day. Here is a Python snippet that computes them for yesterday.

# Compute the Unix timestamps of the first and last second of yesterday,
# interpreted in the machine's local time zone, and print both.
import datetime
import time

# Same wall-clock time as now, one day earlier; only its date part is used below.
yesterday = datetime.datetime.now() - datetime.timedelta(days=1)

# 00:00:00.000000 of yesterday.
yesterday_beginning = yesterday.replace(hour=0, minute=0, second=0, microsecond=0)
# time.mktime() interprets the struct_time as local time; timetuple() drops microseconds.
yesterday_beginning_time = int(time.mktime(yesterday_beginning.timetuple()))

# 23:59:59.999999 of yesterday (the last representable instant of the day).
yesterday_end = yesterday.replace(hour=23, minute=59, second=59, microsecond=999999)
yesterday_end_time = int(time.mktime(yesterday_end.timetuple()))

# print() with a single argument behaves the same on Python 2 and Python 3.
print(yesterday_beginning_time)
print(yesterday_end_time)

Tags: , , ,

Check an array for all null values

Here is a simple function to check an array to see if it contains all null values.

/**
 * Check whether every element of an array is strictly null.
 *
 * @param mixed $arr Value to inspect.
 * @return bool True when $arr is an array whose elements are all null
 *              (an empty array qualifies); false otherwise, including
 *              when $arr is not an array.
 */
function allNulls($arr) {
    if (!is_array($arr)) {
        return false;
    }

    // Compare with !== instead of array_diff(): array_diff() compares the
    // string casts of the elements, so "" and false were mistaken for null.
    foreach ($arr as $value) {
        if ($value !== null) {
            return false;
        }
    }

    return true;
}

// Demo: print "true" or "false" for a handful of sample arrays, one per line.
$samples = array(
    array(null, null, null),
    array(null, 1, null),
    array("test", null, null),
    array("", null, null),
    array(0, null, null),
);

foreach ($samples as $sample) {
    echo (allNulls($sample) ? "true" : "false") . PHP_EOL;
}

Tags: , , ,

Simple PHP Proxy

While developing apps that use external web services, a proxy often comes in handy for working around the same-origin (cross-domain request) restrictions enforced by most browsers. Here is a simple PHP proxy I’ve found quite helpful.



                      

Tags: ,

Dividing a file into chunks along line endings in Erlang

I’ve been dabbling in Erlang recently. I’ve wanted to learn a functional programming language for a while now, and Erlang’s concurrency model makes it rather attractive.

For my “hello world” app, I decided to write a simple log parser which processes chunks of a file in parallel. Here is the part of that app which produces a list of tuples describing the chunks, with their boundaries adjusted to the nearest newline (Unix newlines, \n, in this case).

%% getChunkDivisions(File, Size, Chunksize, ChunkDivisions) -> [{chunk, From, To}]
%%
%% Builds a list of {chunk, From, To} tuples by stepping Size down in steps of
%% Chunksize, moving each computed boundary with walkToNextLineBreak/2, and
%% prepending the resulting tuples to ChunkDivisions.
%% NOTE(review): Size presumably starts out as the file size in bytes - confirm
%% against the caller.

%% Nothing left to divide: emit the leading chunk, which starts at 0.
getChunkDivisions(_File, 0, Chunksize, Acc) ->
	[{chunk,0,Chunksize}|Acc];
%% At most one chunk remains. The original Chunksize-(Chunksize-Size) reduces
%% to Size, which is realigned and becomes the end of the leading chunk.
getChunkDivisions(File, Size, Chunksize, Acc) when Size-Chunksize =< 0 ->
	LeadingEnd = walkToNextLineBreak(File, Size),
	getChunkDivisions(File, 0, LeadingEnd, Acc);
%% More than one chunk remains: cut one chunk off the end and recurse on the rest.
getChunkDivisions(File, Size, Chunksize, Acc) ->
	Boundary = walkToNextLineBreak(File, Size-Chunksize),
	getChunkDivisions(File, Boundary, Chunksize, [{chunk,Boundary,Size}|Acc]).

%% walkToNextLineBreak(File, Start) -> integer()
%%
%% Seeks File to Start-1, scans forward for the first "\n" and returns
%% Start + Index, where Index is the 1-based index of that newline counted
%% from the seek position.
%% The scan continues across 1024-byte reads, so lines longer than one read
%% window are handled. If end of file is reached without a newline, the
%% position just past the last byte read is returned (end of file is treated
%% as a line end).
%% Crashes with badmatch if the seek fails, rather than reading from whatever
%% position the file happened to be at.
%% NOTE(review): string:chr/2 needs list data, so File is assumed not to be
%% opened in binary mode - confirm against the caller.
walkToNextLineBreak(File,Start) ->
	{ok, _} = file:position(File, Start-1),
	scanForLineBreak(File, Start).

%% Reads 1024-byte windows until a newline or end of file is found. Offset is
%% the value of Start shifted by the number of bytes already scanned.
scanForLineBreak(File, Offset) ->
	case file:read(File, 1024) of
		{ok, Data} ->
			case string:chr(Data, $\n) of
				0 -> scanForLineBreak(File, Offset+length(Data));
				Index -> Offset+Index
			end;
		eof ->
			Offset
	end.