Added files for glove

Parent: 84ef7843c9
Commit: 13bf63fd1a
.travis.yml
@@ -0,0 +1,6 @@
language: c
dist: trusty
sudo: required
before_install:
  - sudo apt-get install python2.7 python-numpy python-pip
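# The check below passes only if the demo reports a "Total accuracy" beginning with 22 or 23 (percent).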
script: pip install numpy && ./demo.sh | tee results.txt && [[ `cat results.txt | egrep "Total accuracy. 2[23]" | wc -l` = "1" ]] && echo test-passed
LICENSE
@@ -0,0 +1,202 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright 2014 The Board of Trustees of The Leland Stanford Junior University

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
Makefile
@@ -0,0 +1,22 @@
CC = gcc
# For older gcc, use -O3 or -O2 instead of -Ofast
# CFLAGS = -lm -pthread -Ofast -march=native -funroll-loops -Wno-unused-result
CFLAGS = -lm -pthread -Ofast -march=native -funroll-loops -Wall -Wextra -Wpedantic
BUILDDIR := build
SRCDIR := src

all: dir glove shuffle cooccur vocab_count

dir :
	mkdir -p $(BUILDDIR)
glove : $(SRCDIR)/glove.c
	$(CC) $(SRCDIR)/glove.c -o $(BUILDDIR)/glove $(CFLAGS)
shuffle : $(SRCDIR)/shuffle.c
	$(CC) $(SRCDIR)/shuffle.c -o $(BUILDDIR)/shuffle $(CFLAGS)
cooccur : $(SRCDIR)/cooccur.c
	$(CC) $(SRCDIR)/cooccur.c -o $(BUILDDIR)/cooccur $(CFLAGS)
vocab_count : $(SRCDIR)/vocab_count.c
	$(CC) $(SRCDIR)/vocab_count.c -o $(BUILDDIR)/vocab_count $(CFLAGS)

clean:
	rm -rf glove shuffle cooccur vocab_count build
README.md
@@ -0,0 +1,38 @@
## GloVe: Global Vectors for Word Representation

| nearest neighbors of <br/> <em>frog</em> | Litoria | Leptodactylidae | Rana | Eleutherodactylus |
| --- | ------------------------------- | ------------------- | ---------------- | ------------------- |
| Pictures | <img src="http://nlp.stanford.edu/projects/glove/images/litoria.jpg"></img> | <img src="http://nlp.stanford.edu/projects/glove/images/leptodactylidae.jpg"></img> | <img src="http://nlp.stanford.edu/projects/glove/images/rana.jpg"></img> | <img src="http://nlp.stanford.edu/projects/glove/images/eleutherodactylus.jpg"></img> |

| Comparisons | man -> woman | city -> zip | comparative -> superlative |
| --- | ------------------------|-------------------------|-------------------------|
| GloVe Geometry | <img src="http://nlp.stanford.edu/projects/glove/images/man_woman_small.jpg"></img> | <img src="http://nlp.stanford.edu/projects/glove/images/city_zip_small.jpg"></img> | <img src="http://nlp.stanford.edu/projects/glove/images/comparative_superlative_small.jpg"></img> |

We provide an implementation of the GloVe model for learning word representations, and describe how to download web-dataset vectors or train your own. See the [project page](http://nlp.stanford.edu/projects/glove/) or the [paper](http://nlp.stanford.edu/pubs/glove.pdf) for more information on GloVe vectors.

## Download pre-trained word vectors
The links below contain word vectors obtained from the respective corpora. If you want word vectors trained on massive web datasets, you need only download one of these text files! Pre-trained word vectors are made available under the <a href="http://opendatacommons.org/licenses/pddl/">Public Domain Dedication and License</a>.
<div class="entry">
<ul style="padding-left:0px; margin-top:0px; margin-bottom:0px">
<li> Common Crawl (42B tokens, 1.9M vocab, uncased, 300d vectors, 1.75 GB download): <a href="http://nlp.stanford.edu/data/wordvecs/glove.42B.300d.zip">glove.42B.300d.zip</a> </li>
<li> Common Crawl (840B tokens, 2.2M vocab, cased, 300d vectors, 2.03 GB download): <a href="http://nlp.stanford.edu/data/wordvecs/glove.840B.300d.zip">glove.840B.300d.zip</a> </li>
<li> Wikipedia 2014 + Gigaword 5 (6B tokens, 400K vocab, uncased, 300d vectors, 822 MB download): <a href="http://nlp.stanford.edu/data/wordvecs/glove.6B.zip">glove.6B.zip</a> </li>
<li> Twitter (2B tweets, 27B tokens, 1.2M vocab, uncased, 200d vectors, 1.42 GB download): <a href="http://nlp.stanford.edu/data/wordvecs/glove.twitter.27B.zip">glove.twitter.27B.zip</a> </li>
</ul>
</div>

## Train word vectors on a new corpus

<img src="https://travis-ci.org/stanfordnlp/GloVe.svg?branch=master"></img>

If the web datasets above don't match the semantics of your end use case, you can train word vectors on your own corpus.

    $ git clone http://github.com/stanfordnlp/glove
    $ cd glove && make
    $ ./demo.sh

The demo.sh script downloads a small corpus, consisting of the first 100M characters of Wikipedia. It collects unigram counts, constructs and shuffles cooccurrence data, and trains a simple version of the GloVe model. It also runs a word analogy evaluation script in Python to verify word vector quality. More details about training on your own corpus can be found by reading [demo.sh](https://github.com/stanfordnlp/GloVe/blob/master/demo.sh) or [src/README.md](https://github.com/stanfordnlp/GloVe/tree/master/src).
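
Assuming the demo's default settings (`VECTOR_SIZE=50`, `SAVE_FILE=vectors`), the trained vectors end up in `vectors.txt`, one word per line followed by its vector components. A quick sanity check on the output:

    $ head -n 1 vectors.txt | cut -d' ' -f1   # most frequent word in the corpus
    $ awk 'NR==1{print NF-1}' vectors.txt     # vector dimensionality; should print 50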

### License
All work contained in this package is licensed under the Apache License, Version 2.0. See the included LICENSE file.
demo.sh
@@ -0,0 +1,52 @@
#!/bin/bash
set -e

# Makes programs, downloads sample data, trains a GloVe model, and then evaluates it.
# One optional argument can specify the language used for eval script: matlab, octave or [default] python

make
if [ ! -e text8 ]; then
  if hash wget 2>/dev/null; then
    wget http://mattmahoney.net/dc/text8.zip
  else
    curl -O http://mattmahoney.net/dc/text8.zip
  fi
  unzip text8.zip
  rm text8.zip
fi

CORPUS=text8
VOCAB_FILE=vocab.txt
COOCCURRENCE_FILE=cooccurrence.bin
COOCCURRENCE_SHUF_FILE=cooccurrence.shuf.bin
BUILDDIR=build
SAVE_FILE=vectors
VERBOSE=2
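# Training hyperparameters; run any of the tools below with no arguments to see its full list of options.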
MEMORY=4.0
VOCAB_MIN_COUNT=5
VECTOR_SIZE=50
MAX_ITER=15
WINDOW_SIZE=15
BINARY=2
NUM_THREADS=8
X_MAX=10

echo
echo "$ $BUILDDIR/vocab_count -min-count $VOCAB_MIN_COUNT -verbose $VERBOSE < $CORPUS > $VOCAB_FILE"
$BUILDDIR/vocab_count -min-count $VOCAB_MIN_COUNT -verbose $VERBOSE < $CORPUS > $VOCAB_FILE
echo "$ $BUILDDIR/cooccur -memory $MEMORY -vocab-file $VOCAB_FILE -verbose $VERBOSE -window-size $WINDOW_SIZE < $CORPUS > $COOCCURRENCE_FILE"
$BUILDDIR/cooccur -memory $MEMORY -vocab-file $VOCAB_FILE -verbose $VERBOSE -window-size $WINDOW_SIZE < $CORPUS > $COOCCURRENCE_FILE
echo "$ $BUILDDIR/shuffle -memory $MEMORY -verbose $VERBOSE < $COOCCURRENCE_FILE > $COOCCURRENCE_SHUF_FILE"
$BUILDDIR/shuffle -memory $MEMORY -verbose $VERBOSE < $COOCCURRENCE_FILE > $COOCCURRENCE_SHUF_FILE
echo "$ $BUILDDIR/glove -save-file $SAVE_FILE -threads $NUM_THREADS -input-file $COOCCURRENCE_SHUF_FILE -x-max $X_MAX -iter $MAX_ITER -vector-size $VECTOR_SIZE -binary $BINARY -vocab-file $VOCAB_FILE -verbose $VERBOSE"
$BUILDDIR/glove -save-file $SAVE_FILE -threads $NUM_THREADS -input-file $COOCCURRENCE_SHUF_FILE -x-max $X_MAX -iter $MAX_ITER -vector-size $VECTOR_SIZE -binary $BINARY -vocab-file $VOCAB_FILE -verbose $VERBOSE
if [ "$CORPUS" = 'text8' ]; then
  if [ "$1" = 'matlab' ]; then
    matlab -nodisplay -nodesktop -nojvm -nosplash < ./eval/matlab/read_and_evaluate.m 1>&2
  elif [ "$1" = 'octave' ]; then
    octave < ./eval/octave/read_and_evaluate_octave.m 1>&2
  else
    echo "$ python eval/python/evaluate.py"
    python eval/python/evaluate.py
  fi
fi
src/README.md
@@ -0,0 +1,17 @@
### Package Contents

To train your own GloVe vectors, first you'll need to prepare your corpus as a single text file with all words separated by one or more spaces or tabs. If your corpus has multiple documents, the documents (only) should be separated by new line characters. Cooccurrence contexts for words do not extend past newline characters. Once you create your corpus, you can train GloVe vectors using the following 4 tools. An example is included in `demo.sh`, which you can modify as necessary.
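For instance, a corpus of two documents (contents invented for illustration) is just a plain text file with one document per line:

    the quick brown fox jumps over the lazy dog
    a second document sits on its own line
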
The four main tools in this package are:

#### 1) vocab_count
This tool requires an input corpus that consists of whitespace-separated tokens. Use something like the [Stanford Tokenizer](https://nlp.stanford.edu/software/tokenizer.html) first on raw text. From the corpus, it constructs unigram counts, and optionally thresholds the resulting vocabulary based on total vocabulary size or minimum frequency count.

#### 2) cooccur
Constructs word-word cooccurrence statistics from a corpus. The user should supply a vocabulary file, as produced by `vocab_count`, and may specify a variety of parameters, as described by running `./build/cooccur`.

#### 3) shuffle
Shuffles the binary file of cooccurrence statistics produced by `cooccur`. For large files, the file is automatically split into chunks, each of which is shuffled and stored on disk before being merged and shuffled together. The user may specify a number of parameters, as described by running `./build/shuffle`.

#### 4) glove
Trains the GloVe model on the specified cooccurrence data, which typically will be the output of the `shuffle` tool. The user should supply a vocabulary file, as given by `vocab_count`, and may specify a number of other parameters, which are described by running `./build/glove`.
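
Chained together, the four tools form the full training pipeline. A minimal end-to-end sketch, using the same flags as `demo.sh` (with `corpus.txt` standing in for your prepared corpus):

    $ ./build/vocab_count -min-count 5 -verbose 2 < corpus.txt > vocab.txt
    $ ./build/cooccur -memory 4.0 -vocab-file vocab.txt -verbose 2 -window-size 15 < corpus.txt > cooccurrence.bin
    $ ./build/shuffle -memory 4.0 -verbose 2 < cooccurrence.bin > cooccurrence.shuf.bin
    $ ./build/glove -save-file vectors -threads 8 -input-file cooccurrence.shuf.bin -x-max 10 -iter 15 -vector-size 50 -binary 2 -vocab-file vocab.txt -verbose 2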
src/cooccur.c
@@ -0,0 +1,522 @@
//  Tool to calculate word-word cooccurrence statistics
//
//  Copyright (c) 2014, 2018 The Board of Trustees of
//  The Leland Stanford Junior University. All Rights Reserved.
//
//  Licensed under the Apache License, Version 2.0 (the "License");
//  you may not use this file except in compliance with the License.
//  You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
//  Unless required by applicable law or agreed to in writing, software
//  distributed under the License is distributed on an "AS IS" BASIS,
//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//  See the License for the specific language governing permissions and
//  limitations under the License.
//
//
//  For more information, bug reports, fixes, contact:
//    Jeffrey Pennington (jpennin@stanford.edu)
//    Christopher Manning (manning@cs.stanford.edu)
//    https://github.com/stanfordnlp/GloVe/
//    GlobalVectors@googlegroups.com
//    http://nlp.stanford.edu/projects/glove/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>

#define MAX_STRING_LENGTH 1000
#define TSIZE 1048576
#define SEED 1159241

#define HASHFN bitwisehash

typedef double real;

typedef struct cooccur_rec {
    int word1;
    int word2;
    real val;
} CREC;

typedef struct cooccur_rec_id {
    int word1;
    int word2;
    real val;
    int id;
} CRECID;

typedef struct hashrec {
    char *word;
    long long id;
    struct hashrec *next;
} HASHREC;

int verbose = 2; // 0, 1, 2, or 3
long long max_product; // Cutoff for product of word frequency ranks below which cooccurrence counts will be stored in a compressed full array
long long overflow_length; // Number of cooccurrence records whose product exceeds max_product to store in memory before writing to disk
int window_size = 15; // default context window size
int symmetric = 1; // 0: asymmetric, 1: symmetric
real memory_limit = 3; // soft limit, in gigabytes, used to estimate optimal array sizes
int distance_weighting = 1; // Flag to control the distance weighting of cooccurrence counts
char *vocab_file, *file_head;

/* Efficient string comparison */
int scmp( char *s1, char *s2 ) {
    while (*s1 != '\0' && *s1 == *s2) {s1++; s2++;}
    return (*s1 - *s2);
}

/* Move-to-front hashing and hash function from Hugh Williams, http://www.seg.rmit.edu.au/code/zwh-ipl/ */

/* Simple bitwise hash function */
unsigned int bitwisehash(char *word, int tsize, unsigned int seed) {
    char c;
    unsigned int h;
    h = seed;
    for (; (c = *word) != '\0'; word++) h ^= ((h << 5) + c + (h >> 2));
    return (unsigned int)((h & 0x7fffffff) % tsize);
}

/* Create hash table, initialise pointers to NULL */
HASHREC **inithashtable() {
    int i;
    HASHREC **ht;
    ht = (HASHREC **) malloc( sizeof(HASHREC *) * TSIZE );
    for (i = 0; i < TSIZE; i++) ht[i] = (HASHREC *) NULL;
    return ht;
}

/* Search hash table for given string, return record if found, else NULL */
HASHREC *hashsearch(HASHREC **ht, char *w) {
    HASHREC *htmp, *hprv;
    unsigned int hval = HASHFN(w, TSIZE, SEED);
    for (hprv = NULL, htmp = ht[hval]; htmp != NULL && scmp(htmp->word, w) != 0; hprv = htmp, htmp = htmp->next);
    if (htmp != NULL && hprv != NULL) { // move to front on access
        hprv->next = htmp->next;
        htmp->next = ht[hval];
        ht[hval] = htmp;
    }
    return htmp;
}

/* Insert string in hash table, check for duplicates which should be absent */
void hashinsert(HASHREC **ht, char *w, long long id) {
    HASHREC *htmp, *hprv;
    unsigned int hval = HASHFN(w, TSIZE, SEED);
    for (hprv = NULL, htmp = ht[hval]; htmp != NULL && scmp(htmp->word, w) != 0; hprv = htmp, htmp = htmp->next);
    if (htmp == NULL) {
        htmp = (HASHREC *) malloc(sizeof(HASHREC));
        htmp->word = (char *) malloc(strlen(w) + 1);
        strcpy(htmp->word, w);
        htmp->id = id;
        htmp->next = NULL;
        if (hprv == NULL) ht[hval] = htmp;
        else hprv->next = htmp;
    }
    else fprintf(stderr, "Error, duplicate entry located: %s.\n", htmp->word);
    return;
}

/* Read a word from the input stream. Return 1 when encountering '\n' or EOF (but separate from the word), 0 otherwise.
   Words can be separated by space(s), tab(s), or newline(s). Carriage return characters are just ignored.
   (Okay for Windows, but not for Mac OS 9 and earlier. Ignored even if by themselves or in words.)
   A newline is taken as indicating a new document (contexts won't cross newlines).
   The word argument is assumed to be an array of size MAX_STRING_LENGTH.
   Words that are too long are truncated, with some care taken so that truncation
   cannot land in the middle of a UTF-8 character; little to no harm is done for
   other encodings like ISO-8859-1.
   (This function appears identically in vocab_count.c and cooccur.c.)
*/
int get_word(char *word, FILE *fin) {
    int i = 0, ch;
    for ( ; ; ) {
        ch = fgetc(fin);
        if (ch == '\r') continue;
        if (i == 0 && ((ch == '\n') || (ch == EOF))) {
            word[i] = 0;
            return 1;
        }
        if (i == 0 && ((ch == ' ') || (ch == '\t'))) continue; // skip leading space
        if ((ch == EOF) || (ch == ' ') || (ch == '\t') || (ch == '\n')) {
            if (ch == '\n') ungetc(ch, fin); // return the newline next time as document ender
            break;
        }
        if (i < MAX_STRING_LENGTH - 1)
            word[i++] = ch; // don't allow words to exceed MAX_STRING_LENGTH
    }
    word[i] = 0; // null terminate
    // avoid truncation destroying a multibyte UTF-8 char except if only thing on line (so the i > x tests won't overwrite word[0])
    // see https://en.wikipedia.org/wiki/UTF-8#Description
    if (i == MAX_STRING_LENGTH - 1 && (word[i-1] & 0x80) == 0x80) {
        if ((word[i-1] & 0xC0) == 0xC0) {
            word[i-1] = '\0';
        } else if (i > 2 && (word[i-2] & 0xE0) == 0xE0) {
            word[i-2] = '\0';
        } else if (i > 3 && (word[i-3] & 0xF8) == 0xF0) {
            word[i-3] = '\0';
        }
    }
    return 0;
}

/* Write sorted chunk of cooccurrence records to file, accumulating duplicate entries */
int write_chunk(CREC *cr, long long length, FILE *fout) {
    if (length == 0) return 0;

    long long a = 0;
    CREC old = cr[a];

    for (a = 1; a < length; a++) {
        if (cr[a].word1 == old.word1 && cr[a].word2 == old.word2) {
            old.val += cr[a].val;
            continue;
        }
        fwrite(&old, sizeof(CREC), 1, fout);
        old = cr[a];
    }
    fwrite(&old, sizeof(CREC), 1, fout);
    return 0;
}

/* Compare two cooccurrence records by their word pair; used for qsort */
int compare_crec(const void *a, const void *b) {
    int c;
    if ( (c = ((CREC *) a)->word1 - ((CREC *) b)->word1) != 0) return c;
    else return (((CREC *) a)->word2 - ((CREC *) b)->word2);
}

/* Compare two cooccurrence records by their word pair */
int compare_crecid(CRECID a, CRECID b) {
    int c;
    if ( (c = a.word1 - b.word1) != 0) return c;
    else return a.word2 - b.word2;
}

/* Swap two entries of priority queue */
void swap_entry(CRECID *pq, int i, int j) {
    CRECID temp = pq[i];
    pq[i] = pq[j];
    pq[j] = temp;
}

/* Insert entry into priority queue */
void insert(CRECID *pq, CRECID new, int size) {
    int j = size - 1, p;
    pq[j] = new;
    while ( (p = (j - 1) / 2) >= 0 ) {
        if (compare_crecid(pq[p], pq[j]) > 0) {swap_entry(pq, p, j); j = p;}
        else break;
    }
}

/* Delete entry from priority queue */
void delete(CRECID *pq, int size) {
    int j, p = 0;
    pq[p] = pq[size - 1];
    while ( (j = 2 * p + 1) < size - 1 ) {
        if (j == size - 2) {
            if (compare_crecid(pq[p], pq[j]) > 0) swap_entry(pq, p, j);
            return;
        }
        else {
            if (compare_crecid(pq[j], pq[j+1]) < 0) {
                if (compare_crecid(pq[p], pq[j]) > 0) {swap_entry(pq, p, j); p = j;}
                else return;
            }
            else {
                if (compare_crecid(pq[p], pq[j+1]) > 0) {swap_entry(pq, p, j+1); p = j + 1;}
                else return;
            }
        }
    }
}
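
/* insert() and delete() maintain pq as a binary min-heap ordered by
   compare_crecid, i.e. by (word1, word2). merge_files() below uses them for a
   k-way merge: repeatedly pop the smallest record and refill the heap from
   the chunk file that record came from. */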
/* Write top node of priority queue to file, accumulating duplicate entries */
int merge_write(CRECID new, CRECID *old, FILE *fout) {
    if (new.word1 == old->word1 && new.word2 == old->word2) {
        old->val += new.val;
        return 0; // Indicates duplicate entry
    }
    fwrite(old, sizeof(CREC), 1, fout);
    *old = new;
    return 1; // Actually wrote to file
}

/* Merge [num] sorted files of cooccurrence records */
int merge_files(int num) {
    int i, size;
    long long counter = 0;
    CRECID *pq, new, old;
    char filename[200];
    FILE **fid, *fout;
    fid = malloc(sizeof(FILE *) * num);
    pq = malloc(sizeof(CRECID) * num);
    fout = stdout;
    if (verbose > 1) fprintf(stderr, "Merging cooccurrence files: processed 0 lines.");

    /* Open all files and add first entry of each to priority queue */
    for (i = 0; i < num; i++) {
        sprintf(filename, "%s_%04d.bin", file_head, i);
        fid[i] = fopen(filename, "rb");
        if (fid[i] == NULL) {fprintf(stderr, "Unable to open file %s.\n", filename); return 1;}
        fread(&new, sizeof(CREC), 1, fid[i]);
        new.id = i;
        insert(pq, new, i + 1);
    }

    /* Pop top node, save it in old to see if the next entry is a duplicate */
    size = num;
    old = pq[0];
    i = pq[0].id;
    delete(pq, size);
    fread(&new, sizeof(CREC), 1, fid[i]);
    if (feof(fid[i])) size--;
    else {
        new.id = i;
        insert(pq, new, size);
    }

    /* Repeatedly pop top node and fill priority queue until files have reached EOF */
    while (size > 0) {
        counter += merge_write(pq[0], &old, fout); // Only count the lines written to file, not duplicates
        if ((counter % 100000) == 0) if (verbose > 1) fprintf(stderr, "\033[39G%lld lines.", counter);
        i = pq[0].id;
        delete(pq, size);
        fread(&new, sizeof(CREC), 1, fid[i]);
        if (feof(fid[i])) size--;
        else {
            new.id = i;
            insert(pq, new, size);
        }
    }
    fwrite(&old, sizeof(CREC), 1, fout);
    fprintf(stderr, "\033[0GMerging cooccurrence files: processed %lld lines.\n", ++counter);
    for (i = 0; i < num; i++) {
        sprintf(filename, "%s_%04d.bin", file_head, i);
        remove(filename);
    }
    fprintf(stderr, "\n");
    return 0;
}

/* Collect word-word cooccurrence counts from input stream */
int get_cooccurrence() {
    int flag, x, y, fidcounter = 1;
    long long a, j = 0, k, id, counter = 0, ind = 0, vocab_size, w1, w2, *lookup, *history;
    char format[20], filename[200], str[MAX_STRING_LENGTH + 1];
    FILE *fid, *foverflow;
    real *bigram_table, r;
    HASHREC *htmp, **vocab_hash = inithashtable();
    CREC *cr = malloc(sizeof(CREC) * (overflow_length + 1));
    history = malloc(sizeof(long long) * window_size);

    fprintf(stderr, "COUNTING COOCCURRENCES\n");
    if (verbose > 0) {
        fprintf(stderr, "window size: %d\n", window_size);
        if (symmetric == 0) fprintf(stderr, "context: asymmetric\n");
        else fprintf(stderr, "context: symmetric\n");
    }
    if (verbose > 1) fprintf(stderr, "max product: %lld\n", max_product);
    if (verbose > 1) fprintf(stderr, "overflow length: %lld\n", overflow_length);
    sprintf(format, "%%%ds %%lld", MAX_STRING_LENGTH); // Format to read from vocab file, which has (irrelevant) frequency data
    if (verbose > 1) fprintf(stderr, "Reading vocab from file \"%s\"...", vocab_file);
    fid = fopen(vocab_file, "r");
    if (fid == NULL) {fprintf(stderr, "Unable to open vocab file %s.\n", vocab_file); return 1;}
    while (fscanf(fid, format, str, &id) != EOF) hashinsert(vocab_hash, str, ++j); // Here id is not used: inserting vocab words into hash table with their frequency rank, j
    fclose(fid);
    vocab_size = j;
    j = 0;
    if (verbose > 1) fprintf(stderr, "loaded %lld words.\nBuilding lookup table...", vocab_size);

    /* Build auxiliary lookup table used to index into bigram_table */
    lookup = (long long *) calloc( vocab_size + 1, sizeof(long long) );
    if (lookup == NULL) {
        fprintf(stderr, "Couldn't allocate memory!");
        return 1;
    }
    lookup[0] = 1;
    for (a = 1; a <= vocab_size; a++) {
        if ((lookup[a] = max_product / a) < vocab_size) lookup[a] += lookup[a-1];
        else lookup[a] = lookup[a-1] + vocab_size;
    }
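    /* lookup[a] is 1 + the cumulative number of dense-table slots reserved for
       rows 1..a, where row a holds min(max_product / a, vocab_size) columns.
       Row w1 thus occupies slots lookup[w1-1] - 1 through lookup[w1] - 2, and a
       pair of frequency ranks (w1, w2) with w1 * w2 < max_product is counted
       in bigram_table[lookup[w1-1] + w2 - 2]. */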
    if (verbose > 1) fprintf(stderr, "table contains %lld elements.\n", lookup[a-1]);

    /* Allocate memory for full array which will store all cooccurrence counts for words whose product of frequency ranks is less than max_product */
    bigram_table = (real *) calloc( lookup[a-1], sizeof(real) );
    if (bigram_table == NULL) {
        fprintf(stderr, "Couldn't allocate memory!");
        return 1;
    }

    fid = stdin;
    // sprintf(format, "%%%ds", MAX_STRING_LENGTH);
    sprintf(filename, "%s_%04d.bin", file_head, fidcounter);
    foverflow = fopen(filename, "wb");
    if (verbose > 1) fprintf(stderr, "Processing token: 0");

    /* For each token in input stream, calculate a weighted cooccurrence sum within window_size */
    while (1) {
        if (ind >= overflow_length - window_size) { // If overflow buffer is (almost) full, sort it and write it to temporary file
            qsort(cr, ind, sizeof(CREC), compare_crec);
            write_chunk(cr, ind, foverflow);
            fclose(foverflow);
            fidcounter++;
            sprintf(filename, "%s_%04d.bin", file_head, fidcounter);
            foverflow = fopen(filename, "wb");
            ind = 0;
        }
        flag = get_word(str, fid);
        if (verbose > 2) fprintf(stderr, "Maybe processing token: %s\n", str);
        if (flag == 1) {
            // Newline, reset line index (j); maybe eof.
            if (feof(fid)) {
                if (verbose > 2) fprintf(stderr, "Not getting cooccurs as at eof\n");
                break;
            }
            j = 0;
            if (verbose > 2) fprintf(stderr, "Not getting cooccurs as at newline\n");
            continue;
        }
        counter++;
        if ((counter % 100000) == 0) if (verbose > 1) fprintf(stderr, "\033[19G%lld", counter);
        htmp = hashsearch(vocab_hash, str);
        if (htmp == NULL) {
            if (verbose > 2) fprintf(stderr, "Not getting cooccurs as word not in vocab\n");
            continue; // Skip out-of-vocabulary words
        }
        w2 = htmp->id; // Target word (frequency rank)
        for (k = j - 1; k >= ( (j > window_size) ? j - window_size : 0 ); k--) { // Iterate over all words to the left of target word, but not past beginning of line
            w1 = history[k % window_size]; // Context word (frequency rank)
            if (verbose > 2) fprintf(stderr, "Adding cooccur between words %lld and %lld.\n", w1, w2);
            if ( w1 < max_product / w2 ) { // Product is small enough to store in a full array
                bigram_table[lookup[w1-1] + w2 - 2] += distance_weighting ? 1.0 / ((real)(j - k)) : 1.0; // Weight by inverse of distance between words if needed
                if (symmetric > 0) bigram_table[lookup[w2-1] + w1 - 2] += distance_weighting ? 1.0 / ((real)(j - k)) : 1.0; // If symmetric context is used, exchange roles of w2 and w1 (ie look at right context too)
            }
            else { // Product is too big, data is likely to be sparse. Store these entries in a temporary buffer to be sorted, merged (accumulated), and written to file when it gets full.
                cr[ind].word1 = w1;
                cr[ind].word2 = w2;
                cr[ind].val = distance_weighting ? 1.0 / ((real)(j - k)) : 1.0;
                ind++; // Keep track of how full temporary buffer is
                if (symmetric > 0) { // Symmetric context
                    cr[ind].word1 = w2;
                    cr[ind].word2 = w1;
                    cr[ind].val = distance_weighting ? 1.0 / ((real)(j - k)) : 1.0;
                    ind++;
                }
            }
        }
        history[j % window_size] = w2; // Target word is stored in circular buffer to become context word in the future
        j++;
    }

    /* Write out temp buffer for the final time (it may not be full) */
    if (verbose > 1) fprintf(stderr, "\033[0GProcessed %lld tokens.\n", counter);
    qsort(cr, ind, sizeof(CREC), compare_crec);
    write_chunk(cr, ind, foverflow);
    sprintf(filename, "%s_0000.bin", file_head);

    /* Write out full bigram_table, skipping zeros */
    if (verbose > 1) fprintf(stderr, "Writing cooccurrences to disk");
    fid = fopen(filename, "wb");
    j = 1e6;
    for (x = 1; x <= vocab_size; x++) {
        if ( (long long) (0.75 * log(vocab_size / x)) < j) {
            j = (long long) (0.75 * log(vocab_size / x));
            if (verbose > 1) fprintf(stderr, ".");
        } // log's to make it look (sort of) pretty
        for (y = 1; y <= (lookup[x] - lookup[x-1]); y++) {
            if ((r = bigram_table[lookup[x-1] - 2 + y]) != 0) {
                fwrite(&x, sizeof(int), 1, fid);
                fwrite(&y, sizeof(int), 1, fid);
                fwrite(&r, sizeof(real), 1, fid);
            }
        }
    }

    if (verbose > 1) fprintf(stderr, "%d files in total.\n", fidcounter + 1);
    fclose(fid);
    fclose(foverflow);
    free(cr);
    free(lookup);
    free(bigram_table);
    free(vocab_hash);
    return merge_files(fidcounter + 1); // Merge the sorted temporary files
}

int find_arg(char *str, int argc, char **argv) {
    int i;
    for (i = 1; i < argc; i++) {
        if (!scmp(str, argv[i])) {
            if (i == argc - 1) {
                printf("No argument given for %s\n", str);
                exit(1);
            }
            return i;
        }
    }
    return -1;
}

int main(int argc, char **argv) {
    int i;
    real rlimit, n = 1e5;
    vocab_file = malloc(sizeof(char) * MAX_STRING_LENGTH);
    file_head = malloc(sizeof(char) * MAX_STRING_LENGTH);

    if (argc == 1) {
        printf("Tool to calculate word-word cooccurrence statistics\n");
        printf("Author: Jeffrey Pennington (jpennin@stanford.edu)\n\n");
        printf("Usage options:\n");
        printf("\t-verbose <int>\n");
        printf("\t\tSet verbosity: 0, 1, 2 (default), or 3\n");
        printf("\t-symmetric <int>\n");
        printf("\t\tIf <int> = 0, only use left context; if <int> = 1 (default), use left and right\n");
        printf("\t-window-size <int>\n");
        printf("\t\tNumber of context words to the left (and to the right, if symmetric = 1); default 15\n");
        printf("\t-vocab-file <file>\n");
        printf("\t\tFile containing vocabulary (truncated unigram counts, produced by 'vocab_count'); default vocab.txt\n");
        printf("\t-memory <float>\n");
        printf("\t\tSoft limit for memory consumption, in GB -- based on simple heuristic, so not extremely accurate; default 3.0\n");
        printf("\t-max-product <int>\n");
        printf("\t\tLimit the size of dense cooccurrence array by specifying the max product <int> of the frequency counts of the two cooccurring words.\n\t\tThis value overrides that which is automatically produced by '-memory'. Typically only needs adjustment for use with very large corpora.\n");
        printf("\t-overflow-length <int>\n");
        printf("\t\tLimit to length <int> the sparse overflow array, which buffers cooccurrence data that does not fit in the dense array, before writing to disk.\n\t\tThis value overrides that which is automatically produced by '-memory'. Typically only needs adjustment for use with very large corpora.\n");
        printf("\t-overflow-file <file>\n");
        printf("\t\tFilename, excluding extension, for temporary files; default overflow\n");
        printf("\t-distance-weighting <int>\n");
        printf("\t\tIf <int> = 0, do not weight cooccurrence count by distance between words; if <int> = 1 (default), weight the cooccurrence count by inverse of distance between words\n");

        printf("\nExample usage:\n");
        printf("./cooccur -verbose 2 -symmetric 0 -window-size 10 -vocab-file vocab.txt -memory 8.0 -overflow-file tempoverflow < corpus.txt > cooccurrences.bin\n\n");
        return 0;
    }

    if ((i = find_arg((char *)"-verbose", argc, argv)) > 0) verbose = atoi(argv[i + 1]);
    if ((i = find_arg((char *)"-symmetric", argc, argv)) > 0) symmetric = atoi(argv[i + 1]);
    if ((i = find_arg((char *)"-window-size", argc, argv)) > 0) window_size = atoi(argv[i + 1]);
    if ((i = find_arg((char *)"-vocab-file", argc, argv)) > 0) strcpy(vocab_file, argv[i + 1]);
    else strcpy(vocab_file, (char *)"vocab.txt");
    if ((i = find_arg((char *)"-overflow-file", argc, argv)) > 0) strcpy(file_head, argv[i + 1]);
    else strcpy(file_head, (char *)"overflow");
    if ((i = find_arg((char *)"-memory", argc, argv)) > 0) memory_limit = atof(argv[i + 1]);
    if ((i = find_arg((char *)"-distance-weighting", argc, argv)) > 0) distance_weighting = atoi(argv[i + 1]);

    /* The memory_limit determines a limit on the number of elements in bigram_table and the overflow buffer */
    /* Estimate the maximum value that max_product can take so that this limit is still satisfied */
    rlimit = 0.85 * (real)memory_limit * 1073741824 / (sizeof(CREC));
    while (fabs(rlimit - n * (log(n) + 0.1544313298)) > 1e-3) n = rlimit / (log(n) + 0.1544313298);
    max_product = (long long) n;
    overflow_length = (long long) rlimit / 6; // 0.85 + 1/6 ~= 1
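    /* The number of rank pairs (w1, w2) with w1 * w2 <= n is the divisor
       summatory function D(n) ~ n * (ln(n) + 2*gamma - 1), where gamma is the
       Euler-Mascheroni constant (2*gamma - 1 = 0.1544313298). The fixed-point
       iteration above therefore solves rlimit = n * (ln(n) + 2*gamma - 1) for
       n, sizing the dense array to roughly rlimit elements. */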

    /* Override estimates by specifying limits explicitly on the command line */
    if ((i = find_arg((char *)"-max-product", argc, argv)) > 0) max_product = atoll(argv[i + 1]);
    if ((i = find_arg((char *)"-overflow-length", argc, argv)) > 0) overflow_length = atoll(argv[i + 1]);

    return get_cooccurrence();
}
src/glove.c
@@ -0,0 +1,459 @@
//  GloVe: Global Vectors for Word Representation
//
//  Copyright (c) 2014 The Board of Trustees of
//  The Leland Stanford Junior University. All Rights Reserved.
//
//  Licensed under the Apache License, Version 2.0 (the "License");
//  you may not use this file except in compliance with the License.
//  You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
//  Unless required by applicable law or agreed to in writing, software
//  distributed under the License is distributed on an "AS IS" BASIS,
//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//  See the License for the specific language governing permissions and
//  limitations under the License.
//
//
//  For more information, bug reports, fixes, contact:
//    Jeffrey Pennington (jpennin@stanford.edu)
//    GlobalVectors@googlegroups.com
//    http://nlp.stanford.edu/projects/glove/

#define _FILE_OFFSET_BITS 64 // must precede the system headers to take effect

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <pthread.h>
#include <time.h>

#define MAX_STRING_LENGTH 1000

typedef double real;

typedef struct cooccur_rec {
    int word1;
    int word2;
    real val;
} CREC;

int write_header = 0; // 0=no, 1=yes; writes vocab_size/vector_size as first line for use with some libraries, such as gensim.
int verbose = 2; // 0, 1, or 2
int use_unk_vec = 1; // 0 or 1
int num_threads = 8; // pthreads
int num_iter = 25; // Number of full passes through cooccurrence matrix
int vector_size = 50; // Word vector size
int save_gradsq = 0; // By default don't save squared gradient values
int use_binary = 0; // 0: save as text files; 1: save as binary; 2: both. For binary, save both word and context word vectors.
int model = 2; // For text file output only. 0: concatenate word and context vectors (and biases) i.e. save everything; 1: Just save word vectors (no bias); 2: Save (word + context word) vectors (no biases)
int checkpoint_every = 0; // checkpoint the model for every checkpoint_every iterations. Do nothing if checkpoint_every <= 0
real eta = 0.05; // Initial learning rate
real alpha = 0.75, x_max = 100.0; // Weighting function parameters, not extremely sensitive to corpus, though may need adjustment for very small or very large corpora
real *W, *gradsq, *cost;
long long num_lines, *lines_per_thread, vocab_size;
char *vocab_file, *input_file, *save_W_file, *save_gradsq_file;

/* Efficient string comparison */
int scmp( char *s1, char *s2 ) {
    while (*s1 != '\0' && *s1 == *s2) {s1++; s2++;}
    return (*s1 - *s2);
}

void initialize_parameters() {
    long long a, b;
    vector_size++; // Temporarily increment to allocate space for bias

    /* Allocate space for word vectors and context word vectors, and corresponding gradsq */
    a = posix_memalign((void **)&W, 128, 2 * vocab_size * vector_size * sizeof(real)); // Might perform better than malloc
    if (W == NULL) {
        fprintf(stderr, "Error allocating memory for W\n");
        exit(1);
    }
    a = posix_memalign((void **)&gradsq, 128, 2 * vocab_size * vector_size * sizeof(real)); // Might perform better than malloc
    if (gradsq == NULL) {
        fprintf(stderr, "Error allocating memory for gradsq\n");
        exit(1);
    }
    for (b = 0; b < vector_size; b++) {
        for (a = 0; a < 2 * vocab_size; a++) {
            W[a * vector_size + b] = (rand() / (real)RAND_MAX - 0.5) / vector_size;
        }
    }
    for (b = 0; b < vector_size; b++) {
        for (a = 0; a < 2 * vocab_size; a++) {
            gradsq[a * vector_size + b] = 1.0; // So initial value of eta is equal to initial learning rate
        }
    }
    vector_size--;
}

inline real check_nan(real update) {
    if (isnan(update) || isinf(update)) {
        fprintf(stderr, "\ncaught NaN in update");
        return 0.;
    } else {
        return update;
    }
}
||||
/* Train the GloVe model */
|
||||
void *glove_thread(void *vid) {
|
||||
long long a, b ,l1, l2;
|
||||
long long id = *(long long*)vid;
|
||||
CREC cr;
|
||||
real diff, fdiff, temp1, temp2;
|
||||
FILE *fin;
|
||||
fin = fopen(input_file, "rb");
|
||||
fseeko(fin, (num_lines / num_threads * id) * (sizeof(CREC)), SEEK_SET); //Threads spaced roughly equally throughout file
|
||||
cost[id] = 0;
|
||||
|
||||
real* W_updates1 = (real*)malloc(vector_size * sizeof(real));
|
||||
real* W_updates2 = (real*)malloc(vector_size * sizeof(real));
|
||||
for (a = 0; a < lines_per_thread[id]; a++) {
|
||||
fread(&cr, sizeof(CREC), 1, fin);
|
||||
if (feof(fin)) break;
|
||||
if (cr.word1 < 1 || cr.word2 < 1) { continue; }
|
||||
|
||||
/* Get location of words in W & gradsq */
|
||||
l1 = (cr.word1 - 1LL) * (vector_size + 1); // cr word indices start at 1
|
||||
l2 = ((cr.word2 - 1LL) + vocab_size) * (vector_size + 1); // shift by vocab_size to get separate vectors for context words
|
||||
|
||||
/* Calculate cost, save diff for gradients */
|
||||
diff = 0;
|
||||
for (b = 0; b < vector_size; b++) diff += W[b + l1] * W[b + l2]; // dot product of word and context word vector
|
||||
diff += W[vector_size + l1] + W[vector_size + l2] - log(cr.val); // add separate bias for each word
|
||||
fdiff = (cr.val > x_max) ? diff : pow(cr.val / x_max, alpha) * diff; // multiply weighting function (f) with diff
|
||||
|
||||
// Check for NaN and inf() in the diffs.
|
||||
if (isnan(diff) || isnan(fdiff) || isinf(diff) || isinf(fdiff)) {
|
||||
fprintf(stderr,"Caught NaN in diff for kdiff for thread. Skipping update");
|
||||
continue;
|
||||
}
|
||||
|
||||
cost[id] += 0.5 * fdiff * diff; // weighted squared error
|
||||
|
||||
/* Adaptive gradient updates */
|
||||
fdiff *= eta; // for ease in calculating gradient
|
||||
real W_updates1_sum = 0;
|
||||
real W_updates2_sum = 0;
|
||||
for (b = 0; b < vector_size; b++) {
|
||||
// learning rate times gradient for word vectors
|
||||
temp1 = fdiff * W[b + l2];
|
||||
temp2 = fdiff * W[b + l1];
|
||||
// adaptive updates
|
||||
W_updates1[b] = temp1 / sqrt(gradsq[b + l1]);
|
||||
W_updates2[b] = temp2 / sqrt(gradsq[b + l2]);
|
||||
W_updates1_sum += W_updates1[b];
|
||||
W_updates2_sum += W_updates2[b];
|
||||
gradsq[b + l1] += temp1 * temp1;
|
||||
gradsq[b + l2] += temp2 * temp2;
|
||||
}
|
||||
if (!isnan(W_updates1_sum) && !isinf(W_updates1_sum) && !isnan(W_updates2_sum) && !isinf(W_updates2_sum)) {
|
||||
for (b = 0; b < vector_size; b++) {
|
||||
W[b + l1] -= W_updates1[b];
|
||||
W[b + l2] -= W_updates2[b];
|
||||
}
|
||||
}
|
||||
|
||||
// updates for bias terms
|
||||
W[vector_size + l1] -= check_nan(fdiff / sqrt(gradsq[vector_size + l1]));
|
||||
W[vector_size + l2] -= check_nan(fdiff / sqrt(gradsq[vector_size + l2]));
|
||||
fdiff *= fdiff;
|
||||
gradsq[vector_size + l1] += fdiff;
|
||||
gradsq[vector_size + l2] += fdiff;
|
||||
|
||||
}
|
||||
free(W_updates1);
|
||||
free(W_updates2);
|
||||
|
||||
fclose(fin);
|
||||
pthread_exit(NULL);
|
||||
}
|
||||
|
||||
/* Save params to file */
|
||||
int save_params(int nb_iter) {
|
||||
/*
|
||||
* nb_iter is the number of iteration (= a full pass through the cooccurrence matrix).
|
||||
* nb_iter > 0 => checkpointing the intermediate parameters, so nb_iter is in the filename of output file.
|
||||
* else => saving the final paramters, so nb_iter is ignored.
|
||||
*/
|
||||
|
||||
long long a, b;
|
||||
char format[20];
|
||||
char output_file[MAX_STRING_LENGTH], output_file_gsq[MAX_STRING_LENGTH];
|
||||
char *word = malloc(sizeof(char) * MAX_STRING_LENGTH + 1);
|
||||
FILE *fid, *fout, *fgs;
|
||||
|
||||
if (use_binary > 0) { // Save parameters in binary file
|
||||
if (nb_iter <= 0)
|
||||
sprintf(output_file,"%s.bin",save_W_file);
|
||||
else
|
||||
sprintf(output_file,"%s.%03d.bin",save_W_file,nb_iter);
|
||||
|
||||
fout = fopen(output_file,"wb");
|
||||
if (fout == NULL) {fprintf(stderr, "Unable to open file %s.\n",save_W_file); return 1;}
|
||||
for (a = 0; a < 2 * (long long)vocab_size * (vector_size + 1); a++) fwrite(&W[a], sizeof(real), 1,fout);
|
||||
fclose(fout);
|
||||
if (save_gradsq > 0) {
|
||||
if (nb_iter <= 0)
|
||||
sprintf(output_file_gsq,"%s.bin",save_gradsq_file);
|
||||
else
|
||||
sprintf(output_file_gsq,"%s.%03d.bin",save_gradsq_file,nb_iter);
|
||||
|
||||
fgs = fopen(output_file_gsq,"wb");
|
||||
if (fgs == NULL) {fprintf(stderr, "Unable to open file %s.\n",save_gradsq_file); return 1;}
|
||||
for (a = 0; a < 2 * (long long)vocab_size * (vector_size + 1); a++) fwrite(&gradsq[a], sizeof(real), 1,fgs);
|
||||
fclose(fgs);
|
||||
}
|
||||
}
|
||||
if (use_binary != 1) { // Save parameters in text file
|
||||
if (nb_iter <= 0)
|
||||
sprintf(output_file,"%s.txt",save_W_file);
|
||||
else
|
||||
sprintf(output_file,"%s.%03d.txt",save_W_file,nb_iter);
|
||||
if (save_gradsq > 0) {
|
||||
if (nb_iter <= 0)
|
||||
sprintf(output_file_gsq,"%s.txt",save_gradsq_file);
|
||||
else
|
||||
sprintf(output_file_gsq,"%s.%03d.txt",save_gradsq_file,nb_iter);
|
||||
|
||||
fgs = fopen(output_file_gsq,"wb");
|
||||
if (fgs == NULL) {fprintf(stderr, "Unable to open file %s.\n",save_gradsq_file); return 1;}
|
||||
}
|
||||
fout = fopen(output_file,"wb");
|
||||
if (fout == NULL) {fprintf(stderr, "Unable to open file %s.\n",save_W_file); return 1;}
|
||||
fid = fopen(vocab_file, "r");
|
||||
sprintf(format,"%%%ds",MAX_STRING_LENGTH);
|
||||
if (fid == NULL) {fprintf(stderr, "Unable to open file %s.\n",vocab_file); return 1;}
|
||||
if (write_header) fprintf(fout, "%lld %d\n", vocab_size, vector_size);
|
||||
for (a = 0; a < vocab_size; a++) {
|
||||
if (fscanf(fid,format,word) == 0) return 1;
|
||||
// input vocab cannot contain special <unk> keyword
|
||||
if (strcmp(word, "<unk>") == 0) return 1;
|
||||
fprintf(fout, "%s",word);
|
||||
if (model == 0) { // Save all parameters (including bias)
|
||||
for (b = 0; b < (vector_size + 1); b++) fprintf(fout," %lf", W[a * (vector_size + 1) + b]);
|
||||
for (b = 0; b < (vector_size + 1); b++) fprintf(fout," %lf", W[(vocab_size + a) * (vector_size + 1) + b]);
|
||||
}
|
||||
if (model == 1) // Save only "word" vectors (without bias)
|
||||
for (b = 0; b < vector_size; b++) fprintf(fout," %lf", W[a * (vector_size + 1) + b]);
|
||||
if (model == 2) // Save "word + context word" vectors (without bias)
|
||||
for (b = 0; b < vector_size; b++) fprintf(fout," %lf", W[a * (vector_size + 1) + b] + W[(vocab_size + a) * (vector_size + 1) + b]);
|
||||
fprintf(fout,"\n");
|
||||
if (save_gradsq > 0) { // Save gradsq
|
||||
fprintf(fgs, "%s",word);
|
||||
for (b = 0; b < (vector_size + 1); b++) fprintf(fgs," %lf", gradsq[a * (vector_size + 1) + b]);
|
||||
for (b = 0; b < (vector_size + 1); b++) fprintf(fgs," %lf", gradsq[(vocab_size + a) * (vector_size + 1) + b]);
|
||||
fprintf(fgs,"\n");
|
||||
}
|
||||
if (fscanf(fid,format,word) == 0) return 1; // Eat irrelevant frequency entry
|
||||
}
|
||||
|
||||
if (use_unk_vec) {
|
||||
real* unk_vec = (real*)calloc((vector_size + 1), sizeof(real));
|
||||
real* unk_context = (real*)calloc((vector_size + 1), sizeof(real));
|
||||
word = "<unk>";
|
||||
|
||||
int num_rare_words = vocab_size < 100 ? vocab_size : 100;
|
||||
|
||||
for (a = vocab_size - num_rare_words; a < vocab_size; a++) {
|
||||
for (b = 0; b < (vector_size + 1); b++) {
|
||||
unk_vec[b] += W[a * (vector_size + 1) + b] / num_rare_words;
|
||||
unk_context[b] += W[(vocab_size + a) * (vector_size + 1) + b] / num_rare_words;
|
||||
}
|
||||
}
|
||||
|
||||
fprintf(fout, "%s",word);
|
||||
if (model == 0) { // Save all parameters (including bias)
|
||||
for (b = 0; b < (vector_size + 1); b++) fprintf(fout," %lf", unk_vec[b]);
|
||||
for (b = 0; b < (vector_size + 1); b++) fprintf(fout," %lf", unk_context[b]);
|
||||
}
|
||||
if (model == 1) // Save only "word" vectors (without bias)
|
||||
for (b = 0; b < vector_size; b++) fprintf(fout," %lf", unk_vec[b]);
|
||||
if (model == 2) // Save "word + context word" vectors (without bias)
|
||||
for (b = 0; b < vector_size; b++) fprintf(fout," %lf", unk_vec[b] + unk_context[b]);
|
||||
fprintf(fout,"\n");
|
||||
|
||||
free(unk_vec);
|
||||
free(unk_context);
|
||||
}
|
||||
|
||||
fclose(fid);
|
||||
fclose(fout);
|
||||
if (save_gradsq > 0) fclose(fgs);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Train model */
|
||||
int train_glove() {
|
||||
long long a, file_size;
|
||||
int save_params_return_code;
|
||||
int b;
|
||||
FILE *fin;
|
||||
real total_cost = 0;
|
||||
|
||||
fprintf(stderr, "TRAINING MODEL\n");
|
||||
|
||||
fin = fopen(input_file, "rb");
|
||||
if (fin == NULL) {fprintf(stderr,"Unable to open cooccurrence file %s.\n",input_file); return 1;}
|
||||
fseeko(fin, 0, SEEK_END);
|
||||
file_size = ftello(fin);
|
||||
num_lines = file_size/(sizeof(CREC)); // Assuming the file isn't corrupt and consists only of CREC's
|
||||
fclose(fin);
|
||||
fprintf(stderr,"Read %lld lines.\n", num_lines);
|
||||
if (verbose > 1) fprintf(stderr,"Initializing parameters...");
|
||||
initialize_parameters();
|
||||
if (verbose > 1) fprintf(stderr,"done.\n");
|
||||
if (verbose > 0) fprintf(stderr,"vector size: %d\n", vector_size);
|
||||
if (verbose > 0) fprintf(stderr,"vocab size: %lld\n", vocab_size);
|
||||
if (verbose > 0) fprintf(stderr,"x_max: %lf\n", x_max);
|
||||
if (verbose > 0) fprintf(stderr,"alpha: %lf\n", alpha);
|
||||
pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t));
|
||||
lines_per_thread = (long long *) malloc(num_threads * sizeof(long long));
|
||||
|
||||
time_t rawtime;
|
||||
struct tm *info;
|
||||
char time_buffer[80];
|
||||
// Lock-free asynchronous SGD
|
||||
for (b = 0; b < num_iter; b++) {
|
||||
total_cost = 0;
|
||||
for (a = 0; a < num_threads - 1; a++) lines_per_thread[a] = num_lines / num_threads;
|
||||
lines_per_thread[a] = num_lines / num_threads + num_lines % num_threads;
|
||||
long long *thread_ids = (long long*)malloc(sizeof(long long) * num_threads);
|
||||
for (a = 0; a < num_threads; a++) thread_ids[a] = a;
|
||||
for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, glove_thread, (void *)&thread_ids[a]);
|
||||
for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL);
|
||||
for (a = 0; a < num_threads; a++) total_cost += cost[a];
|
||||
free(thread_ids);
|
||||
|
||||
time(&rawtime);
|
||||
info = localtime(&rawtime);
|
||||
strftime(time_buffer,80,"%x - %I:%M.%S%p", info);
|
||||
fprintf(stderr, "%s, iter: %03d, cost: %lf\n", time_buffer, b+1, total_cost/num_lines);
|
||||
|
||||
if (checkpoint_every > 0 && (b + 1) % checkpoint_every == 0) {
|
||||
fprintf(stderr," saving itermediate parameters for iter %03d...", b+1);
|
||||
save_params_return_code = save_params(b+1);
|
||||
if (save_params_return_code != 0)
|
||||
return save_params_return_code;
|
||||
fprintf(stderr,"done.\n");
|
||||
}
|
||||
|
||||
}
|
||||
free(pt);
|
||||
free(lines_per_thread);
|
||||
return save_params(0);
|
||||
}
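
/*
 * Work-split example (illustrative): with num_lines = 10 and num_threads = 3,
 * the loop above yields lines_per_thread = {3, 3, 4}: each of the first
 * num_threads - 1 threads gets num_lines / num_threads records, and the last
 * thread also absorbs the num_lines % num_threads remainder.
 */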

int find_arg(char *str, int argc, char **argv) {
    int i;
    for (i = 1; i < argc; i++) {
        if (!scmp(str, argv[i])) {
            if (i == argc - 1) {
                printf("No argument given for %s\n", str);
                exit(1);
            }
            return i;
        }
    }
    return -1;
}
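
/*
 * Example: with argv = {"./glove", "-iter", "10"}, find_arg("-iter", argc, argv)
 * returns 1, so the caller reads atoi(argv[2]) == 10. It returns -1 when the
 * flag is absent, and exits if the flag is the last token (no value follows).
 */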

int main(int argc, char **argv) {
    int i;
    FILE *fid;
    vocab_file = malloc(sizeof(char) * MAX_STRING_LENGTH);
    input_file = malloc(sizeof(char) * MAX_STRING_LENGTH);
    save_W_file = malloc(sizeof(char) * MAX_STRING_LENGTH);
    save_gradsq_file = malloc(sizeof(char) * MAX_STRING_LENGTH);
    int result = 0;

    if (argc == 1) {
        printf("GloVe: Global Vectors for Word Representation, v0.2\n");
        printf("Author: Jeffrey Pennington (jpennin@stanford.edu)\n\n");
        printf("Usage options:\n");
        printf("\t-verbose <int>\n");
        printf("\t\tSet verbosity: 0, 1, or 2 (default)\n");
        printf("\t-write-header <int>\n");
        printf("\t\tIf 1, write vocab_size/vector_size as first line. If 0 (default), do nothing.\n");
        printf("\t-vector-size <int>\n");
        printf("\t\tDimension of word vector representations (excluding bias term); default 50\n");
        printf("\t-threads <int>\n");
        printf("\t\tNumber of threads; default 8\n");
        printf("\t-iter <int>\n");
        printf("\t\tNumber of training iterations; default 25\n");
        printf("\t-eta <float>\n");
        printf("\t\tInitial learning rate; default 0.05\n");
        printf("\t-alpha <float>\n");
        printf("\t\tParameter in exponent of weighting function; default 0.75\n");
        printf("\t-x-max <float>\n");
        printf("\t\tParameter specifying cutoff in weighting function; default 100.0\n");
        printf("\t-binary <int>\n");
        printf("\t\tSave output in binary format (0: text, 1: binary, 2: both); default 0\n");
        printf("\t-model <int>\n");
        printf("\t\tModel for word vector output (for text output only); default 2\n");
        printf("\t\t 0: output all data, for both word and context word vectors, including bias terms\n");
        printf("\t\t 1: output word vectors, excluding bias terms\n");
        printf("\t\t 2: output word vectors + context word vectors, excluding bias terms\n");
        printf("\t-input-file <file>\n");
        printf("\t\tBinary input file of shuffled cooccurrence data (produced by 'cooccur' and 'shuffle'); default cooccurrence.shuf.bin\n");
        printf("\t-vocab-file <file>\n");
        printf("\t\tFile containing vocabulary (truncated unigram counts, produced by 'vocab_count'); default vocab.txt\n");
        printf("\t-save-file <file>\n");
        printf("\t\tFilename, excluding extension, for word vector output; default vectors\n");
        printf("\t-gradsq-file <file>\n");
        printf("\t\tFilename, excluding extension, for squared gradient output; default gradsq\n");
        printf("\t-save-gradsq <int>\n");
        printf("\t\tSave accumulated squared gradients; default 0 (off); ignored if gradsq-file is specified\n");
        printf("\t-checkpoint-every <int>\n");
        printf("\t\tCheckpoint a model every <int> iterations; default 0 (off)\n");
        printf("\nExample usage:\n");
        printf("./glove -input-file cooccurrence.shuf.bin -vocab-file vocab.txt -save-file vectors -gradsq-file gradsq -verbose 2 -vector-size 100 -threads 16 -alpha 0.75 -x-max 100.0 -eta 0.05 -binary 2 -model 2\n\n");
        result = 0;
    } else {
        if ((i = find_arg((char *)"-write-header", argc, argv)) > 0) write_header = atoi(argv[i + 1]);
        if ((i = find_arg((char *)"-verbose", argc, argv)) > 0) verbose = atoi(argv[i + 1]);
        if ((i = find_arg((char *)"-vector-size", argc, argv)) > 0) vector_size = atoi(argv[i + 1]);
        if ((i = find_arg((char *)"-iter", argc, argv)) > 0) num_iter = atoi(argv[i + 1]);
        if ((i = find_arg((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]);
        cost = malloc(sizeof(real) * num_threads);
        if ((i = find_arg((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]);
        if ((i = find_arg((char *)"-x-max", argc, argv)) > 0) x_max = atof(argv[i + 1]);
        if ((i = find_arg((char *)"-eta", argc, argv)) > 0) eta = atof(argv[i + 1]);
        if ((i = find_arg((char *)"-binary", argc, argv)) > 0) use_binary = atoi(argv[i + 1]);
        if ((i = find_arg((char *)"-model", argc, argv)) > 0) model = atoi(argv[i + 1]);
        if (model != 0 && model != 1) model = 2;
        if ((i = find_arg((char *)"-save-gradsq", argc, argv)) > 0) save_gradsq = atoi(argv[i + 1]);
        if ((i = find_arg((char *)"-vocab-file", argc, argv)) > 0) strcpy(vocab_file, argv[i + 1]);
        else strcpy(vocab_file, (char *)"vocab.txt");
        if ((i = find_arg((char *)"-save-file", argc, argv)) > 0) strcpy(save_W_file, argv[i + 1]);
        else strcpy(save_W_file, (char *)"vectors");
        if ((i = find_arg((char *)"-gradsq-file", argc, argv)) > 0) {
            strcpy(save_gradsq_file, argv[i + 1]);
            save_gradsq = 1;
        }
        else if (save_gradsq > 0) strcpy(save_gradsq_file, (char *)"gradsq");
        if ((i = find_arg((char *)"-input-file", argc, argv)) > 0) strcpy(input_file, argv[i + 1]);
        else strcpy(input_file, (char *)"cooccurrence.shuf.bin");
        if ((i = find_arg((char *)"-checkpoint-every", argc, argv)) > 0) checkpoint_every = atoi(argv[i + 1]);

        vocab_size = 0;
        fid = fopen(vocab_file, "r");
        if (fid == NULL) {fprintf(stderr, "Unable to open vocab file %s.\n",vocab_file); return 1;}
        while ((i = getc(fid)) != EOF) if (i == '\n') vocab_size++; // Count number of entries in vocab_file
        fclose(fid);

        result = train_glove();
        free(cost);
    }
    free(vocab_file);
    free(input_file);
    free(save_W_file);
    free(save_gradsq_file);
    return result;
}

@@ -0,0 +1,221 @@
// Tool to shuffle entries of word-word cooccurrence files
//
// Copyright (c) 2014 The Board of Trustees of
// The Leland Stanford Junior University. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
//
// For more information, bug reports, fixes, contact:
//    Jeffrey Pennington (jpennin@stanford.edu)
//    GlobalVectors@googlegroups.com
//    http://nlp.stanford.edu/projects/glove/

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

#define MAX_STRING_LENGTH 1000

static const long LRAND_MAX = ((long) RAND_MAX + 2) * (long)RAND_MAX;
typedef double real;

typedef struct cooccur_rec {
    int word1;
    int word2;
    real val;
} CREC;
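
/*
 * Layout note (illustrative): on common 64-bit platforms sizeof(CREC) is
 * 16 bytes (two 4-byte ints followed by an 8-byte double, no padding), and
 * the binary cooccurrence files read and written by these tools are flat
 * arrays of such records, which is why record counts are computed as
 * file_size / sizeof(CREC).
 */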

int verbose = 2; // 0, 1, or 2
long long array_size = 2000000; // size of chunks to shuffle individually
char *file_head; // temporary file string
real memory_limit = 2.0; // soft limit, in gigabytes

/* Efficient string comparison */
int scmp( char *s1, char *s2 ) {
    while (*s1 != '\0' && *s1 == *s2) {s1++; s2++;}
    return (*s1 - *s2);
}


/* Generate uniformly distributed random long ints */
static long rand_long(long n) {
    long limit = LRAND_MAX - LRAND_MAX % n;
    long rnd;
    do {
        rnd = ((long)RAND_MAX + 1) * (long)rand() + (long)rand();
    } while (rnd >= limit);
    return rnd % n;
}
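
/*
 * Rejection-sampling sketch: rnd is drawn from [0, LRAND_MAX] by combining
 * two rand() calls; draws at or above limit (the largest multiple of n that
 * fits) are rejected so that rnd % n is exactly uniform over [0, n). E.g.
 * with a toy 3-valued generator ({0, 1, 2}) and n = 2, limit = 2 rejects the
 * single leftover value 2 that would otherwise bias rnd % 2 toward 0.
 */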

/* Write contents of array to binary file */
int write_chunk(CREC *array, long size, FILE *fout) {
    long i = 0;
    for (i = 0; i < size; i++) fwrite(&array[i], sizeof(CREC), 1, fout);
    return 0;
}

/* Fisher-Yates shuffle */
void shuffle(CREC *array, long n) {
    long i, j;
    CREC tmp;
    for (i = n - 1; i > 0; i--) {
        j = rand_long(i + 1);
        tmp = array[j];
        array[j] = array[i];
        array[i] = tmp;
    }
}
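
/*
 * shuffle(array, n) permutes all n elements uniformly: each position i swaps
 * with a uniformly chosen index in [0, i]. E.g. for n = 3, i = 2 picks from
 * 3 candidates and i = 1 from 2, giving 3 * 2 = 6 equally likely orderings,
 * one per permutation.
 */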

/* Merge shuffled temporary files; doesn't necessarily produce a perfect shuffle, but good enough */
int shuffle_merge(int num) {
    long i, j, k, l = 0;
    int fidcounter = 0;
    CREC *array;
    char filename[MAX_STRING_LENGTH];
    FILE **fid, *fout = stdout;

    array = malloc(sizeof(CREC) * array_size);
    fid = malloc(sizeof(FILE *) * num); // array of file pointers, one per temp file
    for (fidcounter = 0; fidcounter < num; fidcounter++) { // num = number of temporary files to merge
        sprintf(filename,"%s_%04d.bin",file_head, fidcounter);
        fid[fidcounter] = fopen(filename, "rb");
        if (fid[fidcounter] == NULL) {
            fprintf(stderr, "Unable to open file %s.\n",filename);
            return 1;
        }
    }
    if (verbose > 0) fprintf(stderr, "Merging temp files: processed %ld lines.", l);

    while (1) { // Loop until EOF in all files
        i = 0;
        // Read at most array_size values into array, roughly array_size/num from each temp file
        for (j = 0; j < num; j++) {
            if (feof(fid[j])) continue;
            for (k = 0; k < array_size / num; k++) {
                fread(&array[i], sizeof(CREC), 1, fid[j]);
                if (feof(fid[j])) break;
                i++;
            }
        }
        if (i == 0) break;
        l += i;
        shuffle(array, i); // Shuffle the i records read this pass, mixing lines between temp files
        write_chunk(array,i,fout);
        if (verbose > 0) fprintf(stderr, "\033[31G%ld lines.", l);
    }
    fprintf(stderr, "\033[0GMerging temp files: processed %ld lines.", l);
    for (fidcounter = 0; fidcounter < num; fidcounter++) {
        fclose(fid[fidcounter]);
        sprintf(filename,"%s_%04d.bin",file_head, fidcounter);
        remove(filename);
    }
    fprintf(stderr, "\n\n");
    free(array);
    free(fid);
    return 0;
}
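
/*
 * Why the merge pass: each temp file is internally shuffled, but a record
 * near the start of the input stream can only land in an early temp file.
 * The merge reads a slice from every temp file into one buffer, re-shuffles,
 * and writes it out, mixing records across files; as the comment above
 * notes, this approximates rather than guarantees a uniform permutation.
 */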

/* Shuffle large input stream by splitting into chunks */
int shuffle_by_chunks() {
    long i = 0, l = 0;
    int fidcounter = 0;
    char filename[MAX_STRING_LENGTH];
    CREC *array;
    FILE *fin = stdin, *fid;
    array = malloc(sizeof(CREC) * array_size);

    fprintf(stderr,"SHUFFLING COOCCURRENCES\n");
    if (verbose > 0) fprintf(stderr,"array size: %lld\n", array_size);
    sprintf(filename,"%s_%04d.bin",file_head, fidcounter);
    fid = fopen(filename,"wb"); // binary mode, matching the "rb" reads in shuffle_merge
    if (fid == NULL) {
        fprintf(stderr, "Unable to open file %s.\n",filename);
        return 1;
    }
    if (verbose > 1) fprintf(stderr, "Shuffling by chunks: processed 0 lines.");

    while (1) { // Continue until EOF
        if (i >= array_size) { // If array is full, shuffle it and save to temporary file
            shuffle(array, i);
            l += i;
            if (verbose > 1) fprintf(stderr, "\033[22Gprocessed %ld lines.", l);
            write_chunk(array,i,fid);
            fclose(fid);
            fidcounter++;
            sprintf(filename,"%s_%04d.bin",file_head, fidcounter);
            fid = fopen(filename,"wb");
            if (fid == NULL) {
                fprintf(stderr, "Unable to open file %s.\n",filename);
                return 1;
            }
            i = 0;
        }
        fread(&array[i], sizeof(CREC), 1, fin);
        if (feof(fin)) break;
        i++;
    }
    shuffle(array, i); // Last chunk may be smaller than array_size
    write_chunk(array,i,fid);
    l += i;
    if (verbose > 1) fprintf(stderr, "\033[22Gprocessed %ld lines.\n", l);
    if (verbose > 1) fprintf(stderr, "Wrote %d temporary file(s).\n", fidcounter + 1);
    fclose(fid);
    free(array);
    return shuffle_merge(fidcounter + 1); // Merge and shuffle together temporary files
}
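
/*
 * Flow sketch: stdin is consumed array_size records at a time; each full
 * chunk is Fisher-Yates shuffled and written to temp_shuffle_0000.bin,
 * temp_shuffle_0001.bin, ... (assuming the default -temp-file prefix), and
 * shuffle_merge then interleaves and re-shuffles those files onto stdout.
 */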

int find_arg(char *str, int argc, char **argv) {
    int i;
    for (i = 1; i < argc; i++) {
        if (!scmp(str, argv[i])) {
            if (i == argc - 1) {
                printf("No argument given for %s\n", str);
                exit(1);
            }
            return i;
        }
    }
    return -1;
}

int main(int argc, char **argv) {
    int i;
    file_head = malloc(sizeof(char) * MAX_STRING_LENGTH);

    if (argc == 1) {
        printf("Tool to shuffle entries of word-word cooccurrence files\n");
        printf("Author: Jeffrey Pennington (jpennin@stanford.edu)\n\n");
        printf("Usage options:\n");
        printf("\t-verbose <int>\n");
        printf("\t\tSet verbosity: 0, 1, or 2 (default)\n");
        printf("\t-memory <float>\n");
        printf("\t\tSoft limit for memory consumption, in GB; default 2.0\n");
        printf("\t-array-size <int>\n");
        printf("\t\tLimit to length <int> the buffer which stores chunks of data to shuffle before writing to disk. \n\t\tThis value overrides that which is automatically produced by '-memory'.\n");
        printf("\t-temp-file <file>\n");
        printf("\t\tFilename, excluding extension, for temporary files; default temp_shuffle\n");

        printf("\nExample usage: (assuming 'cooccurrence.bin' has been produced by 'cooccur')\n");
        printf("./shuffle -verbose 2 -memory 8.0 < cooccurrence.bin > cooccurrence.shuf.bin\n");
        return 0;
    }

    if ((i = find_arg((char *)"-verbose", argc, argv)) > 0) verbose = atoi(argv[i + 1]);
    if ((i = find_arg((char *)"-temp-file", argc, argv)) > 0) strcpy(file_head, argv[i + 1]);
    else strcpy(file_head, (char *)"temp_shuffle");
    if ((i = find_arg((char *)"-memory", argc, argv)) > 0) memory_limit = atof(argv[i + 1]);
    array_size = (long long) (0.95 * (real)memory_limit * 1073741824/(sizeof(CREC)));
    if ((i = find_arg((char *)"-array-size", argc, argv)) > 0) array_size = atoll(argv[i + 1]);
    return shuffle_by_chunks();
}
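
/*
 * Sizing example (illustrative, assuming sizeof(CREC) == 16): -memory 2.0
 * gives array_size = (long long)(0.95 * 2.0 * 2^30 / 16) = 127506841
 * records, i.e. the chunk buffer targets 95% of the requested soft limit.
 */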

@@ -0,0 +1,262 @@
// Tool to extract unigram counts
//
// GloVe: Global Vectors for Word Representation
// Copyright (c) 2014 The Board of Trustees of
// The Leland Stanford Junior University. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
//
// For more information, bug reports, fixes, contact:
//    Jeffrey Pennington (jpennin@stanford.edu)
//    Christopher Manning (manning@cs.stanford.edu)
//    https://github.com/stanfordnlp/GloVe/
//    GlobalVectors@googlegroups.com
//    http://nlp.stanford.edu/projects/glove/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_STRING_LENGTH 1000
#define TSIZE 1048576
#define SEED 1159241

#define HASHFN bitwisehash

typedef struct vocabulary {
    char *word;
    long long count;
} VOCAB;

typedef struct hashrec {
    char *word;
    long long count;
    struct hashrec *next;
} HASHREC;

int verbose = 2; // 0, 1, or 2
long long min_count = 1; // min occurrences for inclusion in vocab
long long max_vocab = 0; // max_vocab = 0 for no limit

/* Efficient string comparison */
int scmp( char *s1, char *s2 ) {
    while (*s1 != '\0' && *s1 == *s2) {s1++; s2++;}
    return *s1 - *s2;
}


/* Vocab frequency comparison; break ties alphabetically */
int CompareVocabTie(const void *a, const void *b) {
    long long c;
    if ( (c = ((VOCAB *) b)->count - ((VOCAB *) a)->count) != 0) return ( c > 0 ? 1 : -1 );
    else return (scmp(((VOCAB *) a)->word, ((VOCAB *) b)->word));
}

/* Vocab frequency comparison; no tie-breaker */
int CompareVocab(const void *a, const void *b) {
    long long c;
    if ( (c = ((VOCAB *) b)->count - ((VOCAB *) a)->count) != 0) return ( c > 0 ? 1 : -1 );
    else return 0;
}

/* Move-to-front hashing and hash function from Hugh Williams, http://www.seg.rmit.edu.au/code/zwh-ipl/ */

/* Simple bitwise hash function */
unsigned int bitwisehash(char *word, int tsize, unsigned int seed) {
    char c;
    unsigned int h;
    h = seed;
    for ( ; (c = *word) != '\0'; word++) h ^= ((h << 5) + c + (h >> 2));
    return (unsigned int)((h & 0x7fffffff) % tsize);
}
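
/*
 * Hashing sketch: each character XORs the running state with
 * (h << 5) + c + (h >> 2), mixing both high and low bits of h, and the
 * result is masked to 31 bits before reduction modulo tsize (here TSIZE,
 * 2^20), so every bucket index lies in [0, tsize).
 */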

/* Create hash table, initialise pointers to NULL */
HASHREC ** inithashtable() {
    int i;
    HASHREC **ht;
    ht = (HASHREC **) malloc( sizeof(HASHREC *) * TSIZE );
    for (i = 0; i < TSIZE; i++) ht[i] = (HASHREC *) NULL;
    return ht;
}

/* Search hash table for given string, insert if not found */
void hashinsert(HASHREC **ht, char *w) {
    HASHREC *htmp, *hprv;
    unsigned int hval = HASHFN(w, TSIZE, SEED);

    for (hprv = NULL, htmp = ht[hval]; htmp != NULL && scmp(htmp->word, w) != 0; hprv = htmp, htmp = htmp->next);
    if (htmp == NULL) {
        htmp = (HASHREC *) malloc( sizeof(HASHREC) );
        htmp->word = (char *) malloc( strlen(w) + 1 );
        strcpy(htmp->word, w);
        htmp->count = 1;
        htmp->next = NULL;
        if ( hprv==NULL )
            ht[hval] = htmp;
        else
            hprv->next = htmp;
    }
    else {
        /* new records are not moved to front */
        htmp->count++;
        if (hprv != NULL) {
            /* move to front on access */
            hprv->next = htmp->next;
            htmp->next = ht[hval];
            ht[hval] = htmp;
        }
    }
    return;
}
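
/*
 * Move-to-front example: if a bucket chain holds A -> B -> C and B is looked
 * up, the chain becomes B -> A -> C, so frequent tokens (e.g. "the") migrate
 * to the head of their chain and later lookups terminate after one
 * comparison.
 */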

/* Read a word from the input stream. Returns 1 on encountering '\n' or EOF
   (before any word characters are read), 0 otherwise. Words may be separated
   by spaces, tabs, or newlines. Carriage-return characters are simply
   ignored, even when they appear alone or inside words (fine for Windows
   line endings, though not for classic Mac OS files that use bare '\r').
   A newline is taken as indicating a new document (contexts won't cross
   newlines). The word argument is assumed to point to a buffer of size
   MAX_STRING_LENGTH; over-long words are truncated, with some care taken not
   to cut in the middle of a UTF-8 character, and with little to no harm done
   to other encodings such as ISO-8859-1.
   (This function appears identically in vocab_count.c and cooccur.c.)
*/

int get_word(char *word, FILE *fin) {
    int i = 0, ch;
    for ( ; ; ) {
        ch = fgetc(fin);
        if (ch == '\r') continue;
        if (i == 0 && ((ch == '\n') || (ch == EOF))) {
            word[i] = 0;
            return 1;
        }
        if (i == 0 && ((ch == ' ') || (ch == '\t'))) continue; // skip leading space
        if ((ch == EOF) || (ch == ' ') || (ch == '\t') || (ch == '\n')) {
            if (ch == '\n') ungetc(ch, fin); // return the newline next time as document ender
            break;
        }
        if (i < MAX_STRING_LENGTH - 1)
            word[i++] = ch; // don't allow words to exceed MAX_STRING_LENGTH
    }
    word[i] = 0; // null terminate
    // avoid truncation destroying a multibyte UTF-8 char except if only thing on line (so the i > x tests won't overwrite word[0])
    // see https://en.wikipedia.org/wiki/UTF-8#Description
    if (i == MAX_STRING_LENGTH - 1 && (word[i-1] & 0x80) == 0x80) {
        if ((word[i-1] & 0xC0) == 0xC0) {
            word[i-1] = '\0';
        } else if (i > 2 && (word[i-2] & 0xE0) == 0xE0) {
            word[i-2] = '\0';
        } else if (i > 3 && (word[i-3] & 0xF8) == 0xF0) {
            word[i-3] = '\0';
        }
    }
    return 0;
}
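
/*
 * Usage trace: on the input "hello world\n", successive calls yield
 * "hello" (returns 0), "world" (returns 0, pushing the '\n' back), and
 * "" (returns 1, consuming the newline as a document boundary); a final
 * call at EOF also returns 1 with an empty word.
 */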

int get_counts() {
    long long i = 0, j = 0, vocab_size = 12500;
    // char format[20];
    char str[MAX_STRING_LENGTH + 1];
    HASHREC **vocab_hash = inithashtable();
    HASHREC *htmp;
    VOCAB *vocab;
    FILE *fid = stdin;

    fprintf(stderr, "BUILDING VOCABULARY\n");
    if (verbose > 1) fprintf(stderr, "Processed %lld tokens.", i);
    // sprintf(format,"%%%ds",MAX_STRING_LENGTH);
    while ( ! feof(fid)) {
        // Insert all tokens into hashtable
        int nl = get_word(str, fid);
        if (nl) continue; // just a newline marker or feof
        if (strcmp(str, "<unk>") == 0) {
            fprintf(stderr, "\nError, <unk> vector found in corpus.\nPlease remove <unk>s from your corpus (e.g. cat text8 | sed -e 's/<unk>/<raw_unk>/g' > text8.new)");
            return 1;
        }
        hashinsert(vocab_hash, str);
        if (((++i)%100000) == 0) if (verbose > 1) fprintf(stderr,"\033[11G%lld tokens.", i);
    }
    if (verbose > 1) fprintf(stderr, "\033[0GProcessed %lld tokens.\n", i);
    vocab = malloc(sizeof(VOCAB) * vocab_size);
    for (i = 0; i < TSIZE; i++) { // Migrate vocab to array
        htmp = vocab_hash[i];
        while (htmp != NULL) {
            vocab[j].word = htmp->word;
            vocab[j].count = htmp->count;
            j++;
            if (j >= vocab_size) {
                vocab_size += 2500;
                vocab = (VOCAB *)realloc(vocab, sizeof(VOCAB) * vocab_size);
            }
            htmp = htmp->next;
        }
    }
    if (verbose > 1) fprintf(stderr, "Counted %lld unique words.\n", j);
    if (max_vocab > 0 && max_vocab < j)
        // If the vocabulary exceeds the limit, first sort the full vocab by frequency without alphabetical tie-breaks.
        // This results in pseudo-random ordering for words with the same frequency, so that when truncated, the kept words span the whole alphabet
        qsort(vocab, j, sizeof(VOCAB), CompareVocab);
    else max_vocab = j;
    qsort(vocab, max_vocab, sizeof(VOCAB), CompareVocabTie); // After (possibly) truncating, sort (possibly again), breaking ties alphabetically

    for (i = 0; i < max_vocab; i++) {
        if (vocab[i].count < min_count) { // If a minimum frequency cutoff exists, truncate vocabulary
            if (verbose > 0) fprintf(stderr, "Truncating vocabulary at min count %lld.\n", min_count);
            break;
        }
        printf("%s %lld\n", vocab[i].word, vocab[i].count);
    }

    if (i == max_vocab && max_vocab < j) if (verbose > 0) fprintf(stderr, "Truncating vocabulary at size %lld.\n", max_vocab);
    fprintf(stderr, "Using vocabulary of size %lld.\n\n", i);
    return 0;
}
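
/*
 * Output sketch (illustrative counts only): vocab.txt receives one
 * "word count" line per kept word, most frequent first with ties broken
 * alphabetically, e.g.
 *
 *     the 1200
 *     of 900
 *
 * glove reads this file as word-plus-frequency pairs (it discards the
 * frequency column) and counts its lines to set vocab_size.
 */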

int find_arg(char *str, int argc, char **argv) {
    int i;
    for (i = 1; i < argc; i++) {
        if (!scmp(str, argv[i])) {
            if (i == argc - 1) {
                printf("No argument given for %s\n", str);
                exit(1);
            }
            return i;
        }
    }
    return -1;
}

int main(int argc, char **argv) {
    int i;
    if (argc == 1) {
        printf("Simple tool to extract unigram counts\n");
        printf("Author: Jeffrey Pennington (jpennin@stanford.edu)\n\n");
        printf("Usage options:\n");
        printf("\t-verbose <int>\n");
        printf("\t\tSet verbosity: 0, 1, or 2 (default)\n");
        printf("\t-max-vocab <int>\n");
        printf("\t\tUpper bound on vocabulary size, i.e. keep the <int> most frequent words. The minimum frequency words are randomly sampled so as to obtain an even distribution over the alphabet.\n");
        printf("\t-min-count <int>\n");
        printf("\t\tLower limit such that words which occur fewer than <int> times are discarded.\n");
        printf("\nExample usage:\n");
        printf("./vocab_count -verbose 2 -max-vocab 100000 -min-count 10 < corpus.txt > vocab.txt\n");
        return 0;
    }

    if ((i = find_arg((char *)"-verbose", argc, argv)) > 0) verbose = atoi(argv[i + 1]);
    if ((i = find_arg((char *)"-max-vocab", argc, argv)) > 0) max_vocab = atoll(argv[i + 1]);
    if ((i = find_arg((char *)"-min-count", argc, argv)) > 0) min_count = atoll(argv[i + 1]);
    return get_counts();
}