matpalm/wikipediaPhilosophy

This repository was archived by the owner on Aug 28, 2019. It is now read-only.


see http://matpalm.com/blog/2011/08/13/wikipedia-philosophy

-----

# get data
cd data
wget http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2   # 9gb

# flatten to single line
bzcat enwiki-latest-pages-articles.xml.bz2 | ~/flatten_to_one_page_per_line.py > enwiki-latest-pages-articles.pageperline.xml # 30gb
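flatten_to_one_page_per_line.py isn't reproduced in this listing; a minimal sketch of what it presumably does (collapse each <page>...</page> element onto a single line, so the line-oriented tools below can treat one page as one record):

#!/usr/bin/env python
# sketch: buffer everything between <page> and </page>, emit it as one line
import sys

buffered = []
for line in sys.stdin:
    line = line.strip()
    if line.startswith('<page>'):
        buffered = [line]
    elif line.startswith('</page>'):
        buffered.append(line)
        print(' '.join(buffered))
        buffered = []
    elif buffered:
        buffered.append(line)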

# split into redirects and articles
grep '<redirect />' enwiki-latest-pages-articles.pageperline.xml > enwiki-latest-pages-redirects.xml &
grep -v '<redirect />' enwiki-latest-pages-articles.pageperline.xml > enwiki-latest-pages-articles.xml &
wait

# move xml for articles and redirects into hdfs 
hadoop fs -mkdir /full/articles.xml
hadoop fs -copyFromLocal enwiki-latest-pages-articles.xml /full/articles.xml
hadoop fs -mkdir /full/redirects.xml
hadoop fs -copyFromLocal enwiki-latest-pages-redirects.xml /full/redirects.xml

# parse redirects
cd
hadoop jar ~/contrib/streaming/hadoop-streaming.jar \
 -input /full/redirects.xml -output /full/redirects \
 -mapper redirect_parser.py -file redirect_parser.py
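redirect_parser.py's body isn't shown above; it's presumably a streaming mapper along these lines (the regexes are my assumption, not the repo's):

#!/usr/bin/env python
# sketch: emit "source \t target" for each redirect page on stdin
import sys, re

TITLE  = re.compile(r'<title>(.*?)</title>')
TARGET = re.compile(r'#REDIRECT\s*\[\[([^\]|#]+)', re.IGNORECASE)

for page in sys.stdin:
    title, target = TITLE.search(page), TARGET.search(page)
    if title and target:
        print('%s\t%s' % (title.group(1), target.group(1)))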

# dereference all redirects to their final targets
pig -p INPUT=/full/redirects -p OUTPUT=/full/redirects.dereferenced1 -f dereference_redirects.pig
pig -p INPUT=/full/redirects.dereferenced1 -p OUTPUT=/full/redirects.dereferenced2 -f dereference_redirects.pig
pig -p INPUT=/full/redirects.dereferenced2 -p OUTPUT=/full/redirects.dereferenced3 -f dereference_redirects.pig
pig -p INPUT=/full/redirects.dereferenced3 -p OUTPUT=/full/redirects.dereferenced4 -f dereference_redirects.pig
hadoop fs -mv /full/redirects /full/redirects.original
hadoop fs -mv /full/redirects.dereferenced4 /full/redirects
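dereference_redirects.pig isn't listed here, but the repeated invocation suggests each pass rewrites a redirect's target whenever that target is itself a redirect, so the chain length handled roughly doubles per pass and four passes cover chains up to ~16 hops. a local python equivalent of one pass, as an illustration only:

# sketch: one dereferencing pass over a {source: target} redirect map
def dereference_once(redirects):
    return {src: redirects.get(dst, dst) for src, dst in redirects.items()}

r = {'A': 'B', 'B': 'C', 'C': 'Philosophy'}
for _ in range(4):
    r = dereference_once(r)
print(r)  # {'A': 'Philosophy', 'B': 'Philosophy', 'C': 'Philosophy'}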

# run extraction
hadoop jar ~/contrib/streaming/hadoop-streaming.jar \
 -input /full/articles.xml -output /full/edges \
 -mapper article_parser.py -file article_parser.py
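article_parser.py embodies the first-link rules (the usual get-to-philosophy convention: skip links inside parentheses, templates and the like). a much-simplified sketch of the idea; the real parser is necessarily hairier:

#!/usr/bin/env python
# sketch: emit "article \t first_linked_article" for each page on stdin
import sys, re

TITLE = re.compile(r'<title>(.*?)</title>')
LINK  = re.compile(r'\[\[([^\]|#]+)')

def strip_noise(text):
    text = re.sub(r'\{\{[^{}]*\}\}', '', text)  # templates / infoboxes
    return re.sub(r'\([^()]*\)', '', text)      # parenthesised asides

for page in sys.stdin:
    title = TITLE.search(page)
    link = LINK.search(strip_noise(page))
    if title and link:
        print('%s\t%s' % (title.group(1), link.group(1).strip()))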

# run redirects against edges
pig -p INPUT=/full/edges -p OUTPUT=/full/edges.dereferenced -f dereference_redirects.pig

# as a sanity check, one more dereferencing pass should leave the edges unchanged
# pig -p INPUT=/full/edges.dereferenced -p OUTPUT=/full/edges.dereferenced.sanity -f dereference_redirects.pig 

# get to local filesystem
hadoop fs -cat /full/edges.dereferenced/* > edges
# hadoop fs -cat /full/edges.dereferenced.sanity/* > edges.sanity

# add special cases, the parser will, alas, never be perfect...
cat manually_derived_edges >> edges

# calculate distance from Philosophy
java -Xmx8g -cp distanceToPhilosophy/bin/ DistanceToPhilosophy \
 Philosophy edges \
 >DistanceToPhilosophy.stdout 2>DistanceToPhilosophy.stderr
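the DistanceToPhilosophy source isn't included in this listing; conceptually it's a breadth-first search from Philosophy over the reversed first-link graph, printing each reachable article's depth with a FINAL prefix (which the grep below relies on). a python sketch of the same idea:

# sketch: BFS from Philosophy over reversed edges, print "FINAL article \t depth"
from collections import defaultdict, deque

incoming = defaultdict(list)  # target -> [sources whose first link is target]
for line in open('edges'):
    src, dst = line.rstrip('\n').split('\t')
    incoming[dst].append(src)

dist, queue = {'Philosophy': 0}, deque(['Philosophy'])
while queue:
    node = queue.popleft()
    for src in incoming[node]:
        if src not in dist:
            dist[src] = dist[node] + 1
            queue.append(src)

for article, depth in dist.items():
    print('FINAL %s\t%d' % (article, depth))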

# order nodes by their number of descendants
grep ^FINAL DistanceToPhilosophy.stdout | sed -e 's/^FINAL //' > distances
cut -f1 distances > articles
hadoop fs -mkdir /articles_that_led_to_philosophy
hadoop fs -copyFromLocal articles /articles_that_led_to_philosophy
hadoop jar ~/contrib/streaming/hadoop-streaming.jar \
 -input /articles_that_led_to_philosophy -output /num_descendants \
 -mapper 'count_descendants.py edges' -file count_descendants.py -file edges \
 -reducer aggregate
hadoop fs -cat /num_descendants/* | sort -t$'\t' -k2 -nr > descendants.sorted
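count_descendants.py is also not shown; given the "-reducer aggregate", it presumably walks each input article's first-link chain and emits a LongValueSum record per node on the path, so hadoop's built-in aggregate reducer sums how many articles pass through each node. a sketch:

#!/usr/bin/env python
# sketch: credit every node on an article's path with one descendant
import sys

edges = {}
for line in open(sys.argv[1]):  # the edges file shipped with -file
    src, dst = line.rstrip('\n').split('\t')
    edges[src] = dst

for line in sys.stdin:
    node, seen = line.strip(), set()
    while node in edges and node not in seen:
        seen.add(node)
        node = edges[node]
        # the LongValueSum: prefix makes the aggregate reducer sum the 1s
        print('LongValueSum:%s\t1' % node)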

# draw graph of the top 200
head -n 200 descendants.sorted > descendants.top200
./filter_nodes.py descendants.top200 < edges > filtered.edges.top200
./to_dot.py filtered.edges.top200 descendants.top200 | dot -Tpng > top200.png
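filter_nodes.py and to_dot.py round things out; to_dot.py presumably writes a graphviz digraph (its second argument, descendants.top200, likely drives node labels or sizing, which this sketch omits):

#!/usr/bin/env python
# sketch: turn "src \t dst" edge lines into a digraph for dot -Tpng
import sys

print('digraph philosophy {')
for line in open(sys.argv[1]):
    src, dst = line.rstrip('\n').split('\t')
    print('  "%s" -> "%s";' % (src, dst))
print('}')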


About

do all first links on wikipedia _really_ lead to philosophy?
