Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
chbrown committed Jul 12, 2013
0 parents commit b2503e1
Show file tree
Hide file tree
Showing 12 changed files with 1,974 additions and 0 deletions.
12 changes: 12 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Build configuration for the slda binary.
# CC is the C++ compiler driver; LDFLAGS lists the GSL libraries to link.
CC = g++
LDFLAGS = -lgsl -lm -lgslcblas


LSOURCE = main.cpp corpus.cpp slda.cpp utils.cpp opt.cpp
LHEADER = corpus.h slda.h utils.h opt.h settings.h

# Rebuild the binary whenever any source OR header changes.
# (The prerequisite list must name LHEADER; an undefined variable expands to
# nothing, so header edits would otherwise not trigger a rebuild.)
slda: $(LSOURCE) $(LHEADER)
	$(CC) $(LSOURCE) -o $@ $(LDFLAGS)

# clean is a command, not a file: keep it working even if a file named
# "clean" exists in the directory.
.PHONY: clean
clean:
	-rm -f *.o slda
113 changes: 113 additions & 0 deletions corpus.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
// (C) Copyright 2009, Chong Wang, David Blei and Li Fei-Fei

// written by Chong Wang, [email protected]

// This file is part of slda.

// slda is free software; you can redistribute it and/or modify it under
// the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 2 of the License, or (at your
// option) any later version.

// slda is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.

// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
// USA

#include "corpus.h"
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

// Constructs an empty corpus: every counter starts at zero and the document
// list is left empty.
corpus::corpus()
    : num_docs(0),
      size_vocab(0),
      num_classes(0),
      num_total_words(0)
{
}

// Releases every document owned by the corpus and resets the counters.
//
// The loop walks the vector itself rather than trusting num_docs: num_docs is
// a public counter that can differ from docs.size(), and using it as the
// bound would either leak documents or index past the end of the vector.
corpus::~corpus()
{
    for (vector<document*>::size_type i = 0; i < docs.size(); i++)
    {
        delete docs[i];
    }
    docs.clear();

    num_docs = 0;
    size_vocab = 0;
    num_classes = 0;
    num_total_words = 0;
}

// Opens `filename` for reading, or reports the failure on stderr and
// terminates the program. `what` names the kind of file for the message.
static FILE * open_for_reading(const char * filename, const char * what)
{
    FILE * fp = fopen(filename, "r");
    if (fp == NULL)
    {
        fprintf(stderr, "error: cannot open %s file %s\n", what, filename);
        exit(1);
    }
    return fp;
}

// Loads the documents and their labels from two text files.
//
// data_filename:  each record is
//                     <length> <word>:<count> <word>:<count> ...
//                 where <length> is the number of word:count pairs that
//                 follow for that document.
// label_filename: one integer label per document, in document order.
//
// On success one document per record has been appended to docs, and
// num_docs, size_vocab (largest word id + 1), num_total_words and
// num_classes (largest label + 1) have been updated.
//
// Any failure (file cannot be opened, malformed record, negative length,
// label count different from document count) is reported on stderr and the
// program exits with status 1.
//
// NOTE(review): documents are appended to docs while labels are assigned
// starting at index 0, so this looks intended for a single call on an empty
// corpus -- confirm before calling it more than once.
void corpus::read_data(const char * data_filename,
                       const char * label_filename)
{
    const int OFFSET = 0;
    int length = 0, count = 0, word = 0, label = -1;
    int nd = 0, nw = 0;
    int rc = 0;

    FILE * fileptr = open_for_reading(data_filename, "data");
    printf("\nreading data from %s\n", data_filename);

    // Stop on anything other than one successfully parsed integer; testing
    // only against EOF would spin forever on a non-numeric token.
    while ((rc = fscanf(fileptr, "%10d", &length)) == 1)
    {
        if (length < 0)
        {
            fprintf(stderr, "error: negative length %d for document %d in %s\n",
                    length, nd, data_filename);
            exit(1);
        }
        document * doc = new document(length);
        for (int n = 0; n < length; n++)
        {
            if (fscanf(fileptr, "%10d:%10d", &word, &count) != 2)
            {
                fprintf(stderr,
                        "error: malformed word:count pair %d of document %d in %s\n",
                        n, nd, data_filename);
                exit(1);
            }
            word = word - OFFSET;
            doc->words[n] = word;
            doc->counts[n] = count;
            doc->total += count;
            if (word >= nw)
            {
                nw = word + 1;
            }
        }
        num_total_words += doc->total;
        docs.push_back(doc);
        nd++;
    }
    if (rc != EOF)
    {
        fprintf(stderr, "error: unexpected token after document %d in %s\n",
                nd, data_filename);
        exit(1);
    }
    fclose(fileptr);
    num_docs = nd;
    size_vocab = nw;
    printf("number of docs : %d\n", nd);
    printf("number of terms : %d\n", nw);
    printf("number of total words : %d\n", num_total_words);

    fileptr = open_for_reading(label_filename, "label");
    printf("\nreading labels from %s\n", label_filename);
    nd = 0;
    const int doc_count = int(docs.size());
    while ((rc = fscanf(fileptr, "%10d", &label)) == 1)
    {
        // Guard the index before touching docs: a label file longer than the
        // data file must not write past the end of the vector.
        if (nd >= doc_count)
        {
            fprintf(stderr, "error: %s has more labels than the %d documents read\n",
                    label_filename, doc_count);
            exit(1);
        }
        docs[nd]->label = label;
        if (label >= num_classes)
        {
            num_classes = label + 1;
        }
        nd++;
    }
    if (rc != EOF)
    {
        fprintf(stderr, "error: unexpected token after label %d in %s\n",
                nd, label_filename);
        exit(1);
    }
    fclose(fileptr);

    // Checked explicitly (not with assert) so it still fires in NDEBUG builds.
    if (nd != doc_count)
    {
        fprintf(stderr, "error: read %d labels from %s but %d documents\n",
                nd, label_filename, doc_count);
        exit(1);
    }
    printf("number of classes : %d\n\n", num_classes);
}

// Returns the largest `length` among the first num_docs documents,
// or 0 when there are none.
int corpus::max_corpus_length()
{
    int longest = 0;
    int idx = 0;

    while (idx < num_docs)
    {
        const int current = docs[idx]->length;
        longest = (current > longest) ? current : longest;
        ++idx;
    }
    return longest;
}
81 changes: 81 additions & 0 deletions corpus.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
// (C) Copyright 2009, Chong Wang, David Blei and Li Fei-Fei

// written by Chong Wang, [email protected]

// This file is part of slda.

// slda is free software; you can redistribute it and/or modify it under
// the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 2 of the License, or (at your
// option) any later version.

// slda is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.

// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
// USA

#ifndef CORPUS_H
#define CORPUS_H

#include <vector>
using namespace std;

// One document stored as parallel arrays: words[n] pairs with counts[n].
//
// The object owns both arrays. Copy construction and assignment make deep
// copies, so two documents never share (and later double-free) a buffer.
class document
{
public:
    int * words;   // `length` entries, or NULL for a default-constructed document
    int * counts;  // `length` entries paired with words, or NULL
    int length;    // number of entries in words / counts
    int total;     // starts at 0; filled in by whoever populates the document
    int label;     // -1 until a label is assigned
public:
    // Creates an empty document that owns no arrays.
    document()
    {
        words = NULL;
        counts = NULL;
        length = 0;
        total = 0;
        label = -1;
    }
    // Creates a document with room for `len` entries; the array contents are
    // left uninitialized for the caller to fill.
    document(int len)
    {
        length = len;
        words = new int [length];
        counts = new int [length];
        total = 0;
        label = -1;
    }
    // Deep copy: allocates fresh arrays and copies the entries across.
    document(const document & other)
    {
        length = other.length;
        total = other.total;
        label = other.label;
        words = clone_array(other.words, other.length);
        counts = clone_array(other.counts, other.length);
    }
    // Deep-copy assignment. The new arrays are built before the old ones are
    // released, which also makes self-assignment harmless.
    document & operator=(const document & other)
    {
        if (this != &other)
        {
            int * new_words = clone_array(other.words, other.length);
            int * new_counts = clone_array(other.counts, other.length);
            delete [] words;
            delete [] counts;
            words = new_words;
            counts = new_counts;
            length = other.length;
            total = other.total;
            label = other.label;
        }
        return *this;
    }
    // Frees both arrays. delete [] on NULL is a no-op, so each array is
    // released independently of the other.
    ~document()
    {
        delete [] words;
        delete [] counts;
    }
private:
    // Returns a newly allocated copy of the first n ints of src,
    // or NULL when src is NULL.
    static int * clone_array(const int * src, int n)
    {
        if (src == NULL)
        {
            return NULL;
        }
        const int size = (n > 0) ? n : 0;
        int * dst = new int [size];
        for (int i = 0; i < size; i++)
        {
            dst[i] = src[i];
        }
        return dst;
    }
};

// A collection of heap-allocated documents plus summary counters.
//
// The corpus owns the document pointers stored in docs and deletes them in
// its destructor, so it must not be copied: a copy would hold the same
// pointers and delete them a second time.
class corpus
{
public:
    corpus();
    ~corpus();
    // Reads documents from data_filename and their labels from label_filename.
    void read_data(const char * data_filename, const char * label_filename);
    // Returns the largest document length in the corpus.
    int max_corpus_length();
public:
    int num_docs;         // number of documents read
    int size_vocab;       // largest word id seen + 1
    int num_classes;      // largest label seen + 1
    int num_total_words;  // sum of the documents' totals
    vector<document*> docs;
private:
    // Declared but intentionally not defined: copying is disabled.
    corpus(const corpus &);
    corpus & operator=(const corpus &);
};

#endif // CORPUS_H
84 changes: 84 additions & 0 deletions main.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
// (C) Copyright 2009, Chong Wang, David Blei and Li Fei-Fei

// written by Chong Wang, [email protected]

// This file is part of slda.

// slda is free software; you can redistribute it and/or modify it under
// the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 2 of the License, or (at your
// option) any later version.

// slda is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.

// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
// USA

#include <stdio.h>
#include <string.h>
#include "corpus.h"
#include "utils.h"
#include "slda.h"

// Writes the command-line synopsis for both modes (est and inf) to stdout.
void help( void )
{
    static const char * const usage_lines[] = {
        "usage: slda [est] [data] [label] [settings] [alpha] [k] [random/seeded/model_path] [directory]\n",
        " slda [inf] [data] [label] [settings] [model] [directory]\n"
    };
    const int line_count = int(sizeof(usage_lines) / sizeof(usage_lines[0]));

    for (int i = 0; i < line_count; ++i)
    {
        fputs(usage_lines[i], stdout);
    }
}

// Entry point. Dispatches on argv[1]:
//   est  data label settings alpha k init_method directory   (argc >= 9)
//   inf  data label settings model directory                 (argc >= 7)
//
// Returns 0 after a successful run, or when invoked with no arguments (usage
// is printed). Returns 1, after printing usage, when the mode is unknown or
// too few arguments were supplied for the chosen mode; reading argv past
// argc would otherwise dereference a null pointer.
int main(int argc, char* argv[])
{
    if (argc < 2)
    {
        help();
        return 0;
    }

    const bool is_est = (strcmp(argv[1], "est") == 0);
    const bool is_inf = (strcmp(argv[1], "inf") == 0);
    // est reads up to argv[8], inf up to argv[6].
    const int required_argc = is_est ? 9 : 7;

    if ((!is_est && !is_inf) || argc < required_argc)
    {
        help();
        return 1;
    }

    if (is_est)
    {
        corpus c;
        char * data_filename = argv[2];
        char * label_filename = argv[3];
        c.read_data(data_filename, label_filename);
        settings setting;
        char * setting_filename = argv[4];
        setting.read_settings(setting_filename);

        double alpha = atof(argv[5]);
        int num_topics = atoi(argv[6]);
        printf("number of topics is %d\n", num_topics);
        char * init_method = argv[7];
        char * directory = argv[8];
        printf("models will be saved in %s\n", directory);
        make_directory(directory);

        slda model;
        model.init(alpha, num_topics, &c);
        model.v_em(&c, &setting, init_method, directory);
    }
    else
    {
        corpus c;
        char * data_filename = argv[2];
        char * label_filename = argv[3];
        c.read_data(data_filename, label_filename);
        settings setting;
        char * setting_filename = argv[4];
        setting.read_settings(setting_filename);

        char * model_filename = argv[5];
        char * directory = argv[6];
        printf("\nresults will be saved in %s\n", directory);
        make_directory(directory);

        slda model;
        model.load_model(model_filename);
        model.infer_only(&c, &setting, directory);
    }

    return 0;
}
Loading

0 comments on commit b2503e1

Please sign in to comment.