diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d9be4e5 --- /dev/null +++ b/.gitignore @@ -0,0 +1,67 @@ +*_wrapper.cxx +*.a +*.class +*.d +*.dep +*.o +*.orig +*.py[co] +*.so +*.swp + +/SParseval +/tmp + +evalb/evalb + +build* +dist* + +MANIFEST +regression-test-* +tags +TAGS + +python/bllipparser/CharniakParser.py +python/bllipparser/JohnsonReranker.py + +first-stage/PARSE/evalTree +first-stage/PARSE/parseAndEval +first-stage/PARSE/parseIt +first-stage/PARSE/parser_wrapper.C +first-stage/PARSE/swig/*/build/* +first-stage/PARSE/swig/*/lib/* +first-stage/TRAIN/iScale +first-stage/TRAIN/kn3Counts +first-stage/TRAIN/pSfgT +first-stage/TRAIN/pSgT +first-stage/TRAIN/pTgNt +first-stage/TRAIN/pUgT +first-stage/TRAIN/rCounts +first-stage/TRAIN/selFeats +first-stage/TRAIN/trainRs + +/second-stage/nbest + +second-stage/programs/*/read-tree.cc +second-stage/programs/eval-beam/main +second-stage/programs/eval-weights/eval-weights +second-stage/programs/features/best-*parses +second-stage/programs/features/count-*features +second-stage/programs/features/extract-*features +second-stage/programs/features/oracle-score +second-stage/programs/features/parallel-extract-nfeatures +second-stage/programs/features/parallel-extract-spfeatures +second-stage/programs/features/reranker_wrapper.C +second-stage/programs/features/swig/*/build/* +second-stage/programs/features/swig/*/lib/* +second-stage/programs/prepare-data/copy-trees-ss +second-stage/programs/prepare-data/prepare-ec-data +second-stage/programs/prepare-data/prepare-ec-data100 +second-stage/programs/prepare-data/prepare-new-data +second-stage/programs/prepare-data/ptb +second-stage/programs/wlle/avper +second-stage/programs/wlle/cvlm +second-stage/programs/wlle/cvlm-lbfgs +second-stage/programs/wlle/gavper +second-stage/programs/wlle/oracle diff --git a/Makefile b/Makefile index 4c0c390..343f9b4 100644 --- a/Makefile +++ b/Makefile @@ -31,7 +31,7 @@ # # The following high-level goals may also be useful: # -# make nbestrain-clean # removes temporary files used in nbesttrain +# make nbesttrain-clean # removes temporary files used in nbesttrain # make nbest-oracle # oracle evaluation of n-best results # make features # extracts features from 20-fold parses # make train-reranker # trains reranker model @@ -68,12 +68,12 @@ # Version 4.1 and later gcc permit -march=native, but older # versions will need -march=pentium4 or -march=opteron # -# GCCFLAGS = -march=native -mfpmath=sse -msse2 -mmmx -m32 +# GCCFLAGS ?= -march=native -mfpmath=sse -msse2 -mmmx -m32 # CFLAGS is used for all C and C++ compilation # CFLAGS = -MMD -O3 -Wall -ffast-math -finline-functions -fomit-frame-pointer -fstrict-aliasing $(GCCFLAGS) -LDFLAGS = $(GCCLDFLAGS) + EXEC = time # for SWIG wrappers, use these flags instead @@ -88,11 +88,16 @@ EXEC = time # LDFLAGS = -g -Wall $(GCCLDFLAGS) # EXEC = valgrind -CXXFLAGS = $(CFLAGS) -Wno-deprecated +CXXFLAGS ?= $(CFLAGS) -Wno-deprecated export CFLAGS export CXXFLAGS export LDFLAGS +CC ?= gcc +CXX ?= g++ +export CC +export CXX + # Building the 20-fold training data with nbesttrain # -------------------------------------------------- @@ -517,11 +522,14 @@ train-reranker: $(WEIGHTSFILEGZ) # This goal estimates the reranker feature weights (i.e., trains the # reranker). # +# Don't use auto-renaming as in "gzip foo" because it fails if there is +# more than one hardlink on the file (I'm looking at you Time Machine!). +# # $(WEIGHTSFILEGZ): $(ESTIMATOR) $(WEIGHTSFILEGZ): $(ESTIMATOR) $(MODELDIR)/features.gz $(FEATDIR)/train.gz $(FEATDIR)/dev.gz $(FEATDIR)/test1.gz $(ESTIMATORENV) $(ZCAT) $(FEATDIR)/train.gz | $(EXEC) $(ESTIMATOR) $(ESTIMATORFLAGS) -e $(FEATDIR)/dev.gz -f $(MODELDIR)/features.gz -o $(WEIGHTSFILE) -x $(FEATDIR)/test1.gz - rm -f $(WEIGHTSFILEGZ) - gzip $(WEIGHTSFILE) + gzip -c $(WEIGHTSFILE) >$(WEIGHTSFILEGZ) + rm -f $(WEIGHTSFILE) ######################################################################## # # diff --git a/Makefile.mac b/Makefile.mac new file mode 100644 index 0000000..5c455bd --- /dev/null +++ b/Makefile.mac @@ -0,0 +1,68 @@ +# To use these defaults set the MAKEFILES environment variable when calling make. +# export MAKEFILES=`pwd`/Makefile.mac + +uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not') + +# For Mavericks (and Mountain Lion) I set up gcc using macports: +# sudo port install gcc47 +# sudo port select --set gcc mp-gcc47 +# sudo port install boost liblbfgs + +# Using MacPorts means that we have to override the default include and library locations. +LD_INCLUDE_PATH=/opt/local/include +LD_LIBRARY_PATH=/opt/local/lib + +export LD_INCLUDE_PATH +export LD_LIBRARY_PATH + +# The SParseval makefile uses a -lm dependency (a bad idea imho) which fails because there +# is no libm.a to be used. This trick works by mapping that to the system's libm.dylib. +# .LIBPATTERNS+=lib%.dylib +# export .LIBPATTERNS + +# On Mac OS X using -march=native doesn't seem to work (a compilation error will occur). +# Turns out there is a problem with AVX instructions on OSX for gcc after 4.2. +# http://stackoverflow.com/questions/12016281/g-no-such-instruction-with-avx +# http://mac-os-forge.2317878.n4.nabble.com/gcc-as-AVX-binutils-and-MacOS-X-10-7-td144472.html +# So here's what works for me (with or without the -mfpmath=sse - the default is 387): + +GCCFLAGS = -m64 -march=x86-64 -mfpmath=sse -msse -msse2 -msse3 -msse4 -msse4.1 -msse4.2 -mssse3 -I${LD_INCLUDE_PATH} + +# Must use export because otherwise second-stage/programs/wlle/Makefile doesn't get the message. +export GCCFLAGS + +# CC = condor_compile gcc +CC = gcc +export CC + +# CXX = condor_compile g++ +CXX = g++ +export CXX + +# fast options +# Compilation help: you may need to remove -march=native on older compilers. +# GCCFLAGS=-march=native -mfpmath=sse -msse2 -mmmx +FOPENMP=-fopenmp +# CFLAGS=-MMD -O3 -ffast-math -fstrict-aliasing -Wall -finline-functions $(GCCFLAGS) $(FOPENMP) +# LDFLAGS=$(FOPENMP) -L/opt/local/lib + +# debugging options +# GCCFLAGS= +# FOPENMP= +# CFLAGS=-MMD -O0 -g $(GCCFLAGS) $(FOPENMP) +# LDFLAGS=-g $(FOPENMP) +# CXXFLAGS=${CFLAGS} -Wno-deprecated + +# CFLAGS is used for all C and C++ compilation +# +CFLAGS = -MMD -O3 -Wall -ffast-math -finline-functions -fomit-frame-pointer -fstrict-aliasing $(GCCFLAGS) +export CFLAGS + +CXXFLAGS=${CFLAGS} -Wno-deprecated +export CXXFLAGS + +LDFLAGS = -L${LD_LIBRARY_PATH} $(GCCLDFLAGS) +export LDFLAGS + +# This is a handy place to put a local setting without changing Makefile. +# PENNWSJTREEBANK = /usr/local/data/Penn3/parsed/mrg/wsj diff --git a/parse.sh b/parse.sh index 4c7a76c..0bf5668 100755 --- a/parse.sh +++ b/parse.sh @@ -14,5 +14,6 @@ # RERANKDATA=ec50-connll-ic-s5 # RERANKDATA=ec50-f050902-lics5 MODELDIR=second-stage/models/ec50spfinal -ESTIMATORNICKNAME=cvlm-l1c10P1 +# ESTIMATORNICKNAME=cvlm-l1c10P1 +ESTIMATORNICKNAME=lbfgs-l1c10F1n1p2 first-stage/PARSE/parseIt -l399 -N50 first-stage/DATA/EN/ $* | second-stage/programs/features/best-parses -l $MODELDIR/features.gz $MODELDIR/$ESTIMATORNICKNAME-weights.gz diff --git a/python/bllipparser/ParsingShell.py b/python/bllipparser/ParsingShell.py index 1b12737..61246d5 100644 --- a/python/bllipparser/ParsingShell.py +++ b/python/bllipparser/ParsingShell.py @@ -15,10 +15,13 @@ import nltk.tree try: import nltk.draw.tree + have_tree_drawing = False + read_nltk_tree = nltk.tree.Tree.fromstring have_tree_drawing = True - read_nltk_tree = nltk.tree.Tree.parse except ImportError: have_tree_drawing = False +except AttributeError: + have_tree_drawing = False from bllipparser.RerankingParser import RerankingParser diff --git a/second-stage/models/ec50spfinal/features.gz b/second-stage/models/ec50spfinal/features.gz index d24dc76..6dcbbf8 100644 Binary files a/second-stage/models/ec50spfinal/features.gz and b/second-stage/models/ec50spfinal/features.gz differ diff --git a/second-stage/programs/eval-beam/utility.h b/second-stage/programs/eval-beam/utility.h index 58db9f4..8cfe57a 100644 --- a/second-stage/programs/eval-beam/utility.h +++ b/second-stage/programs/eval-beam/utility.h @@ -891,24 +891,42 @@ inline std::ostream& operator<< (std::ostream& os, const boost::shared_ptr& s struct resource_usage { }; +#ifndef __i386 +#define NO_PROC_SELF_STAT +#endif + +#ifdef __APPLE__ +#define NO_PROC_SELF_STAT +#endif + +#ifdef NO_PROC_SELF_STAT +inline std::ostream& operator<< (std::ostream& os, resource_usage r) +{ + return os; +} +#else // Assume we are on a 586 linux inline std::ostream& operator<< (std::ostream& os, resource_usage r) { FILE* fp = fopen("/proc/self/stat", "r"); - assert(fp); - int utime; - int stime; - unsigned int vsize; - unsigned int rss; - int result = - fscanf(fp, "%*d %*s %*c %*d %*d %*d %*d %*d %*u %*u %*u %*u %*u %d %d %*d %*d %*d %*d" - "%*u %*u %*d %u %u", &utime, &stime, &vsize, &rss); - assert(result == 4); - fclose(fp); - // s << "utime = " << utime << ", stime = " << stime << ", vsize = " << vsize << ", rss = " << rss -; - // return s << "utime = " << utime << ", vsize = " << vsize; - return os << "utime " << float(utime)/1.0e2 << "s, vsize " - << float(vsize)/1048576.0 << " Mb."; + // Don't fail if we can't read that (such as on a Mac), just return. + if (fp == NULL) { + return os; + } else { + int utime; + int stime; + unsigned int vsize; + unsigned int rss; + int result = + fscanf(fp, "%*d %*s %*c %*d %*d %*d %*d %*d %*u %*u %*u %*u %*u %d %d %*d %*d %*d %*d" + "%*u %*u %*d %u %u", &utime, &stime, &vsize, &rss); + assert(result == 4); + fclose(fp); + // s << "utime = " << utime << ", stime = " << stime << ", vsize = " << vsize << ", rss = " << rss; + // return s << "utime = " << utime << ", vsize = " << vsize; + return os << "utime " << float(utime)/1.0e2 << "s, vsize " + << float(vsize)/1048576.0 << " Mb."; + } } +#endif #endif // UTILITY_H diff --git a/second-stage/programs/eval-weights/Makefile b/second-stage/programs/eval-weights/Makefile index b6dbd65..a5612f6 100644 --- a/second-stage/programs/eval-weights/Makefile +++ b/second-stage/programs/eval-weights/Makefile @@ -14,7 +14,7 @@ SOURCES = best-indices.cc best-parse.cc best-parses.cc compare-models.cc data.c TARGETS = eval-weights # best-indices best-parse best-parses compare-models pretty-print OBJECTS = $(patsubst %.l,%.o,$(patsubst %.c,%.o,$(SOURCES:%.cc=%.o))) -CC = gcc +CC ?= gcc all: $(TARGETS) diff --git a/second-stage/programs/eval-weights/utility.h b/second-stage/programs/eval-weights/utility.h index 45ef28e..7c816cd 100644 --- a/second-stage/programs/eval-weights/utility.h +++ b/second-stage/programs/eval-weights/utility.h @@ -882,6 +882,14 @@ inline std::ostream& operator<< (std::ostream& os, const boost::shared_ptr& s struct resource_usage { }; #ifndef __i386 +#define NO_PROC_SELF_STAT +#endif + +#ifdef __APPLE__ +#define NO_PROC_SELF_STAT +#endif + +#ifdef NO_PROC_SELF_STAT inline std::ostream& operator<< (std::ostream& os, resource_usage r) { return os; @@ -890,21 +898,24 @@ inline std::ostream& operator<< (std::ostream& os, resource_usage r) inline std::ostream& operator<< (std::ostream& os, resource_usage r) { FILE* fp = fopen("/proc/self/stat", "r"); - assert(fp); - int utime; - int stime; - unsigned int vsize; - unsigned int rss; - int result = - fscanf(fp, "%*d %*s %*c %*d %*d %*d %*d %*d %*u %*u %*u %*u %*u %d %d %*d %*d %*d %*d" - "%*u %*u %*d %u %u", &utime, &stime, &vsize, &rss); - assert(result == 4); - fclose(fp); - // s << "utime = " << utime << ", stime = " << stime << ", vsize = " << vsize << ", rss = " << rss -; - // return s << "utime = " << utime << ", vsize = " << vsize; - return os << "utime " << float(utime)/1.0e2 << "s, vsize " - << float(vsize)/1048576.0 << " Mb."; + // Don't fail if we can't read that (such as on a Mac), just return. + if (fp == NULL) { + return os; + } else { + int utime; + int stime; + unsigned int vsize; + unsigned int rss; + int result = + fscanf(fp, "%*d %*s %*c %*d %*d %*d %*d %*d %*u %*u %*u %*u %*u %d %d %*d %*d %*d %*d" + "%*u %*u %*d %u %u", &utime, &stime, &vsize, &rss); + assert(result == 4); + fclose(fp); + // s << "utime = " << utime << ", stime = " << stime << ", vsize = " << vsize << ", rss = " << rss; + // return s << "utime = " << utime << ", vsize = " << vsize; + return os << "utime " << float(utime)/1.0e2 << "s, vsize " + << float(vsize)/1048576.0 << " Mb."; + } } #endif diff --git a/second-stage/programs/features/utility.h b/second-stage/programs/features/utility.h index 58db9f4..8cfe57a 100644 --- a/second-stage/programs/features/utility.h +++ b/second-stage/programs/features/utility.h @@ -891,24 +891,42 @@ inline std::ostream& operator<< (std::ostream& os, const boost::shared_ptr& s struct resource_usage { }; +#ifndef __i386 +#define NO_PROC_SELF_STAT +#endif + +#ifdef __APPLE__ +#define NO_PROC_SELF_STAT +#endif + +#ifdef NO_PROC_SELF_STAT +inline std::ostream& operator<< (std::ostream& os, resource_usage r) +{ + return os; +} +#else // Assume we are on a 586 linux inline std::ostream& operator<< (std::ostream& os, resource_usage r) { FILE* fp = fopen("/proc/self/stat", "r"); - assert(fp); - int utime; - int stime; - unsigned int vsize; - unsigned int rss; - int result = - fscanf(fp, "%*d %*s %*c %*d %*d %*d %*d %*d %*u %*u %*u %*u %*u %d %d %*d %*d %*d %*d" - "%*u %*u %*d %u %u", &utime, &stime, &vsize, &rss); - assert(result == 4); - fclose(fp); - // s << "utime = " << utime << ", stime = " << stime << ", vsize = " << vsize << ", rss = " << rss -; - // return s << "utime = " << utime << ", vsize = " << vsize; - return os << "utime " << float(utime)/1.0e2 << "s, vsize " - << float(vsize)/1048576.0 << " Mb."; + // Don't fail if we can't read that (such as on a Mac), just return. + if (fp == NULL) { + return os; + } else { + int utime; + int stime; + unsigned int vsize; + unsigned int rss; + int result = + fscanf(fp, "%*d %*s %*c %*d %*d %*d %*d %*d %*u %*u %*u %*u %*u %d %d %*d %*d %*d %*d" + "%*u %*u %*d %u %u", &utime, &stime, &vsize, &rss); + assert(result == 4); + fclose(fp); + // s << "utime = " << utime << ", stime = " << stime << ", vsize = " << vsize << ", rss = " << rss; + // return s << "utime = " << utime << ", vsize = " << vsize; + return os << "utime " << float(utime)/1.0e2 << "s, vsize " + << float(vsize)/1048576.0 << " Mb."; + } } +#endif #endif // UTILITY_H diff --git a/second-stage/programs/prepare-data/utility.h b/second-stage/programs/prepare-data/utility.h index 45ef28e..7c816cd 100644 --- a/second-stage/programs/prepare-data/utility.h +++ b/second-stage/programs/prepare-data/utility.h @@ -882,6 +882,14 @@ inline std::ostream& operator<< (std::ostream& os, const boost::shared_ptr& s struct resource_usage { }; #ifndef __i386 +#define NO_PROC_SELF_STAT +#endif + +#ifdef __APPLE__ +#define NO_PROC_SELF_STAT +#endif + +#ifdef NO_PROC_SELF_STAT inline std::ostream& operator<< (std::ostream& os, resource_usage r) { return os; @@ -890,21 +898,24 @@ inline std::ostream& operator<< (std::ostream& os, resource_usage r) inline std::ostream& operator<< (std::ostream& os, resource_usage r) { FILE* fp = fopen("/proc/self/stat", "r"); - assert(fp); - int utime; - int stime; - unsigned int vsize; - unsigned int rss; - int result = - fscanf(fp, "%*d %*s %*c %*d %*d %*d %*d %*d %*u %*u %*u %*u %*u %d %d %*d %*d %*d %*d" - "%*u %*u %*d %u %u", &utime, &stime, &vsize, &rss); - assert(result == 4); - fclose(fp); - // s << "utime = " << utime << ", stime = " << stime << ", vsize = " << vsize << ", rss = " << rss -; - // return s << "utime = " << utime << ", vsize = " << vsize; - return os << "utime " << float(utime)/1.0e2 << "s, vsize " - << float(vsize)/1048576.0 << " Mb."; + // Don't fail if we can't read that (such as on a Mac), just return. + if (fp == NULL) { + return os; + } else { + int utime; + int stime; + unsigned int vsize; + unsigned int rss; + int result = + fscanf(fp, "%*d %*s %*c %*d %*d %*d %*d %*d %*u %*u %*u %*u %*u %d %d %*d %*d %*d %*d" + "%*u %*u %*d %u %u", &utime, &stime, &vsize, &rss); + assert(result == 4); + fclose(fp); + // s << "utime = " << utime << ", stime = " << stime << ", vsize = " << vsize << ", rss = " << rss; + // return s << "utime = " << utime << ", vsize = " << vsize; + return os << "utime " << float(utime)/1.0e2 << "s, vsize " + << float(vsize)/1048576.0 << " Mb."; + } } #endif diff --git a/second-stage/programs/wlle/Makefile b/second-stage/programs/wlle/Makefile index 75a803c..26cbd20 100644 --- a/second-stage/programs/wlle/Makefile +++ b/second-stage/programs/wlle/Makefile @@ -46,15 +46,15 @@ libdata.a: data.o liblmdata.a: lmdata.o ar rcv liblmdata.a lmdata.o; ranlib liblmdata.a -CC=gcc +# CC=gcc # fast options # Compilation help: you may need to remove -march=native on older compilers. -GCCFLAGS=-march=native -mfpmath=sse -msse2 -mmmx +# GCCFLAGS=-march=native -mfpmath=sse -msse2 -mmmx FOPENMP=-fopenmp -CFLAGS=-MMD -O3 -ffast-math -fstrict-aliasing -Wall -finline-functions $(GCCFLAGS) $(FOPENMP) -LDFLAGS=$(FOPENMP) -CXXFLAGS=${CFLAGS} -Wno-deprecated +# CFLAGS=-MMD -O3 -ffast-math -fstrict-aliasing -Wall -finline-functions $(GCCFLAGS) $(FOPENMP) +LDFLAGS+=$(FOPENMP) +# CXXFLAGS=${CFLAGS} -Wno-deprecated # debugging options # GCCFLAGS= diff --git a/second-stage/programs/wlle/utility.h b/second-stage/programs/wlle/utility.h index a6aa90c..19f48bc 100644 --- a/second-stage/programs/wlle/utility.h +++ b/second-stage/programs/wlle/utility.h @@ -894,6 +894,14 @@ inline std::ostream& operator<< (std::ostream& os, const boost::shared_ptr& s struct resource_usage { }; #ifndef __i386 +#define NO_PROC_SELF_STAT +#endif + +#ifdef __APPLE__ +#define NO_PROC_SELF_STAT +#endif + +#ifdef NO_PROC_SELF_STAT inline std::ostream& operator<< (std::ostream& os, resource_usage r) { return os; @@ -902,21 +910,24 @@ inline std::ostream& operator<< (std::ostream& os, resource_usage r) inline std::ostream& operator<< (std::ostream& os, resource_usage r) { FILE* fp = fopen("/proc/self/stat", "r"); - assert(fp); - int utime; - int stime; - unsigned int vsize; - unsigned int rss; - int result = - fscanf(fp, "%*d %*s %*c %*d %*d %*d %*d %*d %*u %*u %*u %*u %*u %d %d %*d %*d %*d %*d" - "%*u %*u %*d %u %u", &utime, &stime, &vsize, &rss); - assert(result == 4); - fclose(fp); - // s << "utime = " << utime << ", stime = " << stime << ", vsize = " << vsize << ", rss = " << rss -; - // return s << "utime = " << utime << ", vsize = " << vsize; - return os << "utime " << float(utime)/1.0e2 << "s, vsize " - << float(vsize)/1048576.0 << " Mb."; + // Don't fail if we can't read that (such as on a Mac), just return. + if (fp == NULL) { + return os; + } else { + int utime; + int stime; + unsigned int vsize; + unsigned int rss; + int result = + fscanf(fp, "%*d %*s %*c %*d %*d %*d %*d %*d %*u %*u %*u %*u %*u %d %d %*d %*d %*d %*d" + "%*u %*u %*d %u %u", &utime, &stime, &vsize, &rss); + assert(result == 4); + fclose(fp); + // s << "utime = " << utime << ", stime = " << stime << ", vsize = " << vsize << ", rss = " << rss; + // return s << "utime = " << utime << ", vsize = " << vsize; + return os << "utime " << float(utime)/1.0e2 << "s, vsize " + << float(vsize)/1048576.0 << " Mb."; + } } #endif