I was recently tasked with writing a Hadoop map/reduce job. This job had the requirement of taking a list of regular expressions and scouring hundreds of gigabytes of log files for matches. Since I’ve been leaning more and more towards Scala, I wanted to use it for my job, but I also wanted to use Maven for my job’s package management to make the job easy to set up and extend. And finally, I wanted to have unit tests for my mapper and reducer and an overall job unit test. The result is this project I posted to GitHub as a template for future projects. I hope it proves as helpful for others as I’m sure it’ll be for me.
Archive for category software development
Simple Scala Map/Reduce Job
Oct 27
Select distinct for MongoDB
Oct 23
Here is a handy script I’ve been using for MongoDB to retrieve a list of all the fields used in a collection. This uses a map/reduce routine and has to comb over all the documents in a collection so you may want to exercise caution when using this script.
// Prints the distinct top-level field names used by the documents of a
// collection, using a map/reduce pass over every document.
//
// usage:
// mongo localhost/foo --quiet --eval="var collection='bar';" getcollectionkeys.js
//
// `collection` is expected to be defined by the caller (see --eval above).

// Name of the scratch collection that receives the map/reduce output.
var keysCollection = collection + "_keys";

// Map step: emit every field name of the current document as a key.
var emitFieldNames = function() {
    for (var key in this) { emit(key, null); }
};

// Reduce step: only the keys matter, so the values are discarded.
var discardValues = function(key, stuff) { return null; };

var mr = db.runCommand({
    "mapreduce": collection,
    "map": emitFieldNames,
    "reduce": discardValues,
    "out": keysCollection
});

// The emitted keys become the _id values of the output collection.
print(db[mr.result].distinct("_id"));

// Remove the scratch collection again.
db[keysCollection].drop();
Here is a simple web scraping script I wrote for PhantomJS, the immensely useful headless browser, to load a page, inject jQuery into it, and then scrape the page using a user-supplied jQuery selector.
# Scrape a web page with PhantomJS: open the URL given as the first argument,
# inject jQuery (plus helper scripts) into it, and print the trimmed innerHTML
# of every element matching the jQuery selector given as the second argument.
#
# usage: phantomjs scrape_element.coffee <url> <jquery-selector>
page = require('webpage').create()
system = require 'system'

phantom.injectJs "static/js/underscore-min.js"

# Relay console output from the page context to our stdout, dropping any
# message that starts with "Unsafe".
page.onConsoleMessage = (msg) ->
  if not msg.match /^Unsafe/
    console.log msg

# Executed inside the page via page.evaluate, so `$` refers to the jQuery
# injected below and console.log is routed through page.onConsoleMessage.
scrapeEl = (elselector) ->
  rows = $ elselector
  for el in rows
    if el.innerHTML
      str = el.innerHTML.trim()
      if str.length > 0
        console.log str

# system.args[0] is the script name; the URL and the selector must follow it.
# phantom.exit does not halt the current tick, hence the explicit else branch.
if system.args.length < 3
  console.log "Usage: phantomjs #{system.args[0]} <url> <jquery-selector>"
  phantom.exit 1
else
  page.open system.args[1], (status) ->
    if status isnt 'success'
      phantom.exit 1
    else
      page.injectJs "static/js/underscore-min.js"
      page.injectJs "static/js/utils.js"
      page.injectJs "static/js/jquery-1.8.2.min.js"
      page.evaluate scrapeEl, system.args[2]
      phantom.exit()
Run it with:
phantomjs scrape_element.coffee "http://www.moviefone.com/coming-soon" ".movieTitle span"
node.js at Facebook
Jan 23
Recently I found the need to create an init.d script, and since I had a hard time finding an example elsewhere [1], here’s the overly simple script I came up with to get the job done:
#!/bin/bash
# myapp daemon
# chkconfig: 345 20 80
# description: myapp daemon
# processname: myapp

# Minimal SysV-style init script supporting start, status, stop and restart.
# The daemon's PID is recorded in $PIDFILE by "start" and consumed by
# "status" and "stop".

DAEMON_PATH="/home/wes/Development/projects/myapp"

DAEMON=myapp
DAEMONOPTS="-my opts"

NAME=myapp
DESC="My daemon description"
PIDFILE=/var/run/$NAME.pid
SCRIPTNAME=/etc/init.d/$NAME

case "$1" in
start)
    printf "%-50s" "Starting $NAME..."
    # Refuse to launch the daemon from the wrong working directory.
    cd "$DAEMON_PATH" || { printf "%s\n" "Fail"; exit 1; }
    # Launch in the background and capture its PID via $!.
    # $DAEMONOPTS is deliberately unquoted so it splits into separate options.
    PID=$($DAEMON $DAEMONOPTS > /dev/null 2>&1 & echo $!)
    #echo "Saving PID" $PID " to " $PIDFILE
    if [ -z "$PID" ]; then
        printf "%s\n" "Fail"
    else
        echo "$PID" > "$PIDFILE"
        printf "%s\n" "Ok"
    fi
;;
status)
    printf "%-50s" "Checking $NAME..."
    if [ -f "$PIDFILE" ]; then
        PID=$(cat "$PIDFILE")
        # Query this exact PID. Grepping the whole process list would also
        # match other PIDs or command lines that merely contain the digits.
        if [ -n "$PID" ] && ps -p "$PID" > /dev/null 2>&1; then
            echo "Running"
        else
            printf "%s\n" "Process dead but pidfile exists"
        fi
    else
        printf "%s\n" "Service not running"
    fi
;;
stop)
    printf "%-50s" "Stopping $NAME"
    # Only read the pidfile once we know it exists.
    if [ -f "$PIDFILE" ]; then
        PID=$(cat "$PIDFILE")
        # SIGHUP is the stop signal this script has always sent; switch to
        # TERM if the daemon treats HUP as "reload".
        kill -HUP "$PID"
        printf "%s\n" "Ok"
        rm -f "$PIDFILE"
    else
        printf "%s\n" "pidfile not found"
    fi
;;
restart)
    "$0" stop
    "$0" start
;;
*)
    echo "Usage: $0 {status|start|stop|restart}"
    exit 1
;;
esac
This script will work in /etc/init.d on Xubuntu 11.10 (so most Debian-based systems) and CentOS 5.5 and you can control it via chkconfig.
- That said, if you know of such an example I’d love to hear from you. [↩]
Fun with jsonselect
Nov 16
One of the strengths of CSS and jQuery is that they provide a common and powerful mechanism, known as a selector language, for referencing bits of data, especially data whose structure is not exactly known at runtime. That makes such an addressing scheme a perfect fit for the often lumpy world of HTML.
Increasingly JSON is being used as a transport medium for data and with the rise of NoSQL solutions, having a selector language for JSON makes a lot of sense when dealing with JSON documents whose structure isn’t deterministic.
JSONSelect provides a good implementation of just such a JSON selector language, but after working with it on a project I found myself needing to do more than it allowed. Namely, I wanted (1) to be able to perform a selection and receive the matching paths instead of the data contained at those paths, and (2) to be able to modify the data at a given path in place.
// Additions to the JSONSelect API. Each call is on its own line so that the
// trailing comment of one statement no longer comments out the next.
jsonselect.match(sel, obj, asPath);        // Added the asPath flag to return a path instead of the values
jsonselect.forEach(sel, obj, fun, asPath); // Added the same flag to forEach, I use this to ... (note is cut off in the original)
jsonselect.get(path, obj);                 // For getting the value using a path
jsonselect.set(path, value, obj);          // For setting the value of a path
jsonselect.del(path, root);                // For deleting a path
Here is my modified version of jsonselect in case anyone needs help solving the same problems I mentioned above.
When writing reports I’ve often come across the need to find the Unix timestamps for the beginning and end of a day. Here is a Python snippet that does just that for yesterday.
# Compute the Unix timestamps (whole seconds) of the first and last moment of
# yesterday in the local time zone, and print them one per line.
import datetime
import time

# Any instant during yesterday; only its calendar date is used below.
yesterday = datetime.datetime.now() - datetime.timedelta(days=1)

# 00:00:00.000000 on yesterday's date.
yesterday_beginning = datetime.datetime.combine(yesterday.date(), datetime.time.min)
# time.mktime interprets the time tuple as local time.
yesterday_beginning_time = int(time.mktime(yesterday_beginning.timetuple()))

# 23:59:59.999999 on yesterday's date; timetuple() drops the microseconds, so
# the timestamp is that of 23:59:59.
yesterday_end = datetime.datetime.combine(yesterday.date(), datetime.time.max)
yesterday_end_time = int(time.mktime(yesterday_end.timetuple()))

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(yesterday_beginning_time)
print(yesterday_end_time)
Here is a simple function to check an array to see if it contains all null values.
/**
 * Check whether every element of an array is strictly null.
 *
 * Elements are compared with !== null, so values such as "", false or 0 do
 * not count as null. (array_diff compares elements by their string
 * representation, which made "" and false indistinguishable from null.)
 *
 * @param mixed $arr Value to inspect.
 * @return bool True if $arr is an array whose elements are all null (an
 *              empty array qualifies); false otherwise, including when
 *              $arr is not an array.
 */
function allNulls($arr) {
    if (!is_array($arr)) {
        return false;
    }
    foreach ($arr as $value) {
        if ($value !== null) {
            return false;
        }
    }
    return true;
}
// Demo: run allNulls() over a few sample arrays and print "true" or "false"
// for each one, one result per line.
$samples = array(
    array(null, null, null),
    array(null, 1, null),
    array("test", null, null),
    array("", null, null),
    array(0, null, null),
);
foreach ($samples as $sample) {
    if (allNulls($sample)) {
        echo "true" . PHP_EOL;
    } else {
        echo "false" . PHP_EOL;
    }
}
Simple PHP Proxy
May 24
While developing apps that use external web services, a proxy often comes in handy in order to work around the pesky cross-domain restrictions (the same-origin policy) enforced by most browsers. Here is a simple PHP proxy I’ve found quite helpful.
// http://benalman.com/projects/php-simple-proxy/
//
// Forwards the parameters of the incoming request ($_REQUEST) as POST fields
// to a fixed upstream URL and relays the upstream body back to the client.
$url = "http://mcaf.ee/api/shorten";
$ch = curl_init($url);
curl_setopt($ch, CURLOPT_HEADER, 0);
curl_setopt($ch, CURLOPT_TIMEOUT, 20);
curl_setopt($ch, CURLOPT_FORBID_REUSE, true);
curl_setopt($ch, CURLOPT_MAXCONNECTS, 16);
curl_setopt($ch, CURLOPT_ENCODING, "gzip");
// Return the response from curl_exec() instead of printing it directly.
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_POSTFIELDS, http_build_query($_REQUEST));

$result = curl_exec($ch);
$status = curl_getinfo($ch, CURLINFO_HTTP_CODE);
$type = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
curl_close($ch);

// Fall back to text/html only when upstream reported no content type. The
// previous check was inverted and overwrote every real content type.
if (empty($type)) $type = 'text/html';
header('Content-Type: '.$type);

// Only an upstream 500 is passed through; every other status is reported to
// the client as 200.
switch($status) {
    case 500:
        header("HTTP/1.1 500 Internal Server Error");
        header("Cache-Control: no-cache");
        break;
    case 200:
    default:
        header("HTTP/1.1 200 OK");
        break;
}
echo $result;
I’ve been dabbling in Erlang recently. I’ve wanted to learn a functional programming language for a while now, and Erlang’s concurrency makes it rather attractive.
For my “hello world” app, I decided to write a simple log parser which processes chunks of a file in parallel. Here is a part of that app which produces a list of tuples that describe the chunks, adjusted to the nearest newline (Unix newlines, \n, in this case).
%% getChunkDivisions(File, Size, Chunksize, ChunkDivisions) -> [{chunk, From, To}]
%%
%% Prepends {chunk, From, To} tuples to ChunkDivisions while walking from
%% Size towards 0 in steps of Chunksize. Every computed boundary is passed
%% through walkToNextLineBreak/2 before it is used.
%% NOTE(review): From/To look like offsets into File - confirm with the caller.

%% Nothing left to divide: emit the leading chunk {chunk, 0, Chunksize}.
getChunkDivisions(_File, 0, Chunksize, ChunkDivisions) ->
    [{chunk, 0, Chunksize} | ChunkDivisions];
%% At most one chunk remains: correct its end, then finish via the clause
%% above, passing the corrected end in the Chunksize position.
getChunkDivisions(File, Size, Chunksize, ChunkDivisions) when Size - Chunksize =< 0 ->
    LastEnd = walkToNextLineBreak(File, Chunksize - (Chunksize - Size)),
    getChunkDivisions(File, 0, LastEnd, ChunkDivisions);
%% More than one chunk remains: split one off the end and recurse on the rest.
getChunkDivisions(File, Size, Chunksize, ChunkDivisions) ->
    Boundary = walkToNextLineBreak(File, Size - Chunksize),
    getChunkDivisions(File, Boundary, Chunksize,
                      [{chunk, Boundary, Size} | ChunkDivisions]).
%% walkToNextLineBreak(File, Start) -> integer()
%%
%% Seeks File to Start-1, reads up to 1024 bytes and returns Start plus the
%% 1-based index of the first $\n within the data read.
%% The result of file:position/2 is not checked. A read that does not return
%% {ok, Data} (e.g. eof) fails with badmatch. string:chr/2 yields 0 when no
%% newline is found, in which case Start is returned unchanged.
walkToNextLineBreak(File, Start) ->
    file:position(File, Start - 1),
    {ok, Chunk} = file:read(File, 1024),
    NewlineIndex = string:chr(Chunk, $\n),
    Start + NewlineIndex.

