Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Search and Evaluation #7

Open
wants to merge 30 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 29 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
b78d308
moving goldstandard class to this branch
hansonhl Jun 18, 2018
753a07e
implementing menu and toolbar option for evaluation
hansonhl Jun 18, 2018
4993ddf
removing temporary files
hansonhl Jun 18, 2018
7fd6bd8
created new MainMenu class and transferred code from mainwindow, made…
hansonhl Jun 18, 2018
01b16bb
adding comments to understand Crab1 better
hansonhl Jun 19, 2018
bc04454
Tested and fixed bugs for gold standard evaluation
hansonhl Jun 19, 2018
f15a924
moved openXML functionality from GoldStandard class into mainwindow
hansonhl Jun 19, 2018
9f5d696
added treeView item for goldstandard, revised algorithm to include ov…
hansonhl Jun 20, 2018
0631f61
adding .pro.user
hansonhl Jun 21, 2018
5bcc682
Merge remote-tracking branch 'upstream/master' into gold-standard
hansonhl Jun 21, 2018
663da63
finishing model and table view of gold standard
hansonhl Jun 21, 2018
adcfabd
Merge remote-tracking branch 'upstream/master' into gold-standard
hansonhl Jun 22, 2018
b576132
starting on json output
hansonhl Jun 25, 2018
72445ca
finishing Morfessor parse file input, starting modification of tree view
hansonhl Jun 26, 2018
c8e6eae
adding handler class for old GSMap structure
hansonhl Jun 26, 2018
d07c5bc
starting on parsemap class
hansonhl Jun 26, 2018
7e87cfe
finishing table view of Morfessor parse evaluation
hansonhl Jun 26, 2018
94bf791
finishing eval parses object
hansonhl Jun 26, 2018
b7cb088
committing pro.user
hansonhl Jun 26, 2018
662687a
Merge branch 'gold-standard' into parsemap
hansonhl Jun 26, 2018
ad83334
cleaning evaluation code
hansonhl Jun 28, 2018
a6b24ba
finishing gui and signal connections for find functionality
hansonhl Jun 28, 2018
c431eeb
finishing dock widget for search
hansonhl Jul 5, 2018
a316d00
cleaning up code and correcting bugs in search function
hansonhl Jul 5, 2018
381db75
finishing search
hansonhl Jul 6, 2018
bfd52c9
Merge remote-tracking branch 'upstream/master' into search
hansonhl Jul 6, 2018
19a27df
adding documentation comments
hansonhl Jul 6, 2018
4b9f280
adding documentation comments
hansonhl Jul 6, 2018
522890b
supporting reading in Morfessor Parse txt with /STM or /PRE at end of…
hansonhl Jul 6, 2018
3cccfec
Merge branch 'master' into search
JohnAGoldsmith Jul 19, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 76 additions & 7 deletions QtLing/Lexicon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include "SuffixCollection.h"
#include "WordCollection.h"
#include "Word.h"
#include "evaluation.h"
#include "cparse.h"

CLexicon::CLexicon( CLexicon* lexicon, bool suffix_flag)
Expand All @@ -39,6 +40,8 @@ CLexicon::CLexicon( CLexicon* lexicon, bool suffix_flag)
m_Hypothesis_map = new QMap<QString, CHypothesis*>;
m_entropy_threshold_for_stems = 1.2;
m_parent_lexicon = lexicon;
m_goldstandard = NULL;
m_eval_parses = NULL;

// This is part of an experiment.
m_category_types["Words"] = CT_word;
Expand Down Expand Up @@ -77,6 +80,7 @@ CLexicon::~CLexicon()
delete m_PrefixSignatures;
delete m_ParaSignatures;
delete m_PassiveSignatures;
delete m_goldstandard;
}

CSignatureCollection* CLexicon::get_active_signature_collection(){
Expand Down Expand Up @@ -112,10 +116,56 @@ void CLexicon::clear_lexicon(){
m_Hypotheses = new QList<CHypothesis*>;


}

/*!
 * Gold standard support: builds a GoldStandard from the file named by
 * file_name (presumably an XML file, going by the function name) and makes
 * it the lexicon's current gold standard.
 *
 * The lexicon owns the object held in m_goldstandard (it is deleted in
 * ~CLexicon and in delete_goldstandard()), so a gold standard loaded earlier
 * is freed here rather than leaked when a new file is loaded. Any pointer
 * previously obtained for the old gold standard is invalid after this call.
 *
 * \param file_name  passed unchanged to the GoldStandard constructor.
 * \return the newly created GoldStandard, which is also stored in
 *         m_goldstandard.
 */
GoldStandard* CLexicon::new_goldstandard_from_xml(QString& file_name)
{
    // Construct the replacement first so m_goldstandard is left untouched
    // if construction throws.
    GoldStandard* fresh_goldstandard = new GoldStandard(file_name);
    delete m_goldstandard;   // deleting a null pointer is a no-op
    m_goldstandard = fresh_goldstandard;
    return m_goldstandard;
}

/*!
 * Evaluates the lexicon's word collection (m_Words) against the currently
 * loaded gold standard.
 *
 * \return false when no gold standard is loaded or when
 *         GoldStandard::evaluate() reports failure; true otherwise.
 */
bool CLexicon::do_gs_evaluation()
{
    // Nothing to evaluate against until a gold standard has been loaded.
    if (!m_goldstandard) {
        qDebug() << 134 << "Lexicon.cpp: evaluation failed: GoldStandard not loaded";
        return false;
    }

    const bool ok = m_goldstandard->evaluate(m_Words);
    if (!ok) {
        return false;
    }

    qDebug() << 139 << "Lexicon.cpp: evaluation completed";
    return true;
}

/*!
 * Builds an EvalParses object from the file named by file_name (presumably a
 * plain-text parse file, going by the function name) and makes it the
 * lexicon's current set of imported parses.
 *
 * The lexicon owns the object held in m_eval_parses (see
 * delete_eval_parses()), so parses imported earlier are freed here rather
 * than leaked when a new file is loaded. Any pointer previously obtained for
 * the old object is invalid after this call.
 *
 * \param file_name  passed unchanged to the EvalParses constructor.
 * \return the newly created EvalParses, which is also stored in
 *         m_eval_parses.
 */
EvalParses* CLexicon::new_eval_parses_from_txt(QString& file_name)
{
    // Construct the replacement first so m_eval_parses is left untouched
    // if construction throws.
    EvalParses* fresh_eval_parses = new EvalParses(file_name);
    delete m_eval_parses;   // deleting a null pointer is a no-op
    m_eval_parses = fresh_eval_parses;
    return m_eval_parses;
}

/*!
 * Destroys the imported parses, if any, and resets m_eval_parses to null so
 * that later evaluation calls see that nothing is loaded.
 */
void CLexicon::delete_eval_parses()
{
    delete m_eval_parses;      // safe when nothing has been loaded
    m_eval_parses = nullptr;
}

/*!
 * Evaluates the imported parses (m_eval_parses) against the currently loaded
 * gold standard.
 *
 * \return false when either the gold standard or the imported parses are
 *         missing, or when GoldStandard::evaluate() reports failure;
 *         true otherwise.
 */
bool CLexicon::do_gs_evaluation_on_eval_parses()
{
    // Both inputs must have been loaded before an evaluation can run.
    const bool inputs_loaded = (m_goldstandard != NULL) && (m_eval_parses != NULL);
    if (!inputs_loaded) {
        qDebug() << 153 << "Lexicon.cpp: evaluation failed: GoldStandard or evaluation file not loaded";
        return false;
    }

    const bool ok = m_goldstandard->evaluate(m_eval_parses);
    if (!ok) {
        return false;
    }

    qDebug() << 158 << "Lexicon.cpp: evaluation on imported parses completed";
    return true;
}


////////////////////////////////////////////////////////////////////////////////////////////////////
//linguistic methods
/**
Expand All @@ -138,6 +188,12 @@ void CLexicon::dump_signatures_to_debug()

}

/* Crab_1:
* Used after MainWindow::read_dx1_file, which parses the dx1 file and
* stores words and their counts into CWordCollection object in Lexicon,
* and generates the SortedStringArrays.
*
*/
void CLexicon::Crab_1()
{
FindProtostems();
Expand Down Expand Up @@ -166,6 +222,9 @@ void CLexicon::Crab_1()
* This is the first of the three initial parts of finding signatures.
* This makes a cut at every point in a word where the successor frequency
* is greater than 1.
* Taking the sorted string array as input, finds protoroots and stores them
* by modifying m_suffix_protostems_2 (for suffixes)
* or m_prefix_protostems_2 (for prefixes)
*/
void CLexicon::FindProtostems()
{ word_t this_word, previous_word;
Expand Down Expand Up @@ -268,7 +327,9 @@ void CLexicon::FindProtostems()

/*!
* This is the second of the three initial parts of finding signatures.
* This creates stem/affix pairs, which are put in a long list of "Parses".
* This creates stem/affix pairs, which are put in a long list of "Parses":
* QList<QPair<QString,QString>>* m_Parses
*
*/
void CLexicon::CreateStemAffixPairs()
{
Expand Down Expand Up @@ -377,7 +438,9 @@ void CLexicon::assign_suffixes_to_stems(QString name_of_calling_function)
stem_list * p_this_stem_list;
//affix_set * this_ptr_to_affix_set(NULL);
map_sigstring_to_suffix_set temp_stems_to_affix_set;
// Equivalent to QMap<QString, QList<QString>>
map_sigstring_to_stem_list temp_signatures_to_stems;
// Equivalent to QSet<QString>
morph_set * pSet;
CWord* pWord;
m_ProgressBar->reset();
Expand All @@ -400,6 +463,7 @@ void CLexicon::assign_suffixes_to_stems(QString name_of_calling_function)
m_StatusBar->showMessage("Form signatures: 2. tentative signatures.");

int count= 0;
// equivalent to QMapIterator<QString, QSet<QString>*>
QMapIterator<QString, morph_set*> stem_iter(temp_stems_to_affix_set); // part 1
while (stem_iter.hasNext()) // make a presignature for each stem.
{ qApp->processEvents();
Expand Down Expand Up @@ -442,7 +506,9 @@ void CLexicon::assign_suffixes_to_stems(QString name_of_calling_function)
affix_list this_affix_list = this_signature_string.split("=");
if (p_this_stem_list->size() >= MINIMUM_NUMBER_OF_STEMS)
{
// put signature strings into m_Signatures
if( m_SuffixesFlag) {
// CSignature* pSig;
pSig = *m_Signatures << this_signature_string;
} else {
pSig = *m_PrefixSignatures << this_signature_string;
Expand All @@ -453,6 +519,8 @@ void CLexicon::assign_suffixes_to_stems(QString name_of_calling_function)
this_affix_t = affix_iter_2.next();
link_signature_and_affix(pSig,this_affix_t);
}
// for each stem in the list of stems in the map of signatures,
// use the function link_signature_and_stem()
stem_list_iterator stem_iter(*p_this_stem_list);
while (stem_iter.hasNext()){
this_stem_t = stem_iter.next();
Expand Down Expand Up @@ -514,13 +582,17 @@ void CLexicon::link_signature_and_affix(CSignature * pSig, affix_t this_affix)
}
void CLexicon::link_signature_and_stem(stem_t this_stem_t , CSignature* pSig, QString this_signature_string)
{
// add a CStem object into m_suffixal_stems/m_prefixal_stems:
// - add CSignature pointer to that CStem object
// - add CStem pointer to CSignature object
CStem* pStem;
QString this_affix, this_word;
m_SuffixesFlag ?
pStem = m_suffixal_stems->find_or_add(this_stem_t):
pStem = m_prefixal_stems->find_or_add(this_stem_t);
pStem->add_signature (pSig);
pSig->add_stem_pointer(pStem);

int stem_count = 0;
affix_list this_affix_list = this_signature_string.split("=");
QListIterator<suffix_t> affix_iter(this_affix_list);
Expand All @@ -535,7 +607,7 @@ void CLexicon::link_signature_and_stem(stem_t this_stem_t , CSignature* pSig,
}
CWord* pWord = m_Words->get_word(this_word);
if (!pWord){
qDebug() << this_word << "Error: this_word not found among words. Line 486" << this_stem_t << this_affix << pSig->get_key() << this_signature_string;
qDebug() << this_word << "Error: this_word not found among words. Line 505" << this_stem_t << this_affix << pSig->get_key() << this_signature_string;
} else{
stem_count += pWord->get_word_count();
pWord->add_parse_triple(this_stem_t, this_affix, pSig->get_key());
Expand All @@ -546,6 +618,8 @@ void CLexicon::link_signature_and_stem(stem_t this_stem_t , CSignature* pSig,
}
pStem->set_count(stem_count);
}


bool contains(QList<QString> * list2, QList<QString> * list1){
for (int i=0; i < list1->length();i++){
bool success = false;
Expand Down Expand Up @@ -787,8 +861,3 @@ void CLexicon::collect_parasuffixes()
}
m_ParaSuffixes->sort_by_count();
};





31 changes: 27 additions & 4 deletions QtLing/Lexicon.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,15 @@
#include <QStatusBar>
#include "SignatureCollection.h"
#include "Typedefs.h"
#include "evaluation.h"

class MainWindow;
class CWordCollection;
class CStemCollection;
class CSuffixCollection;
class CPrefixCollection;
class QProgressBar;
class CHypothesis;

class CParse;

// part of an experiment:
Expand Down Expand Up @@ -139,9 +140,9 @@ class CLexicon
CPrefixCollection * m_Prefixes;
CSignatureCollection * m_Signatures;
CSignatureCollection * m_PrefixSignatures;
CWordCollection * m_Compounds;
CWordCollection * m_Compounds; // nothing done yet
//QList<QPair<QString,QString>> * m_Parses;
QList<CParse*> * m_Parses;
QList<CParse*> * m_Parses; //

QMap<QString,int> m_Parse_map;
// QMap<QString, int> m_suffix_protostems;
Expand All @@ -151,22 +152,27 @@ class CLexicon
// with a particular proto-stem (i.e., a word-beginning). This replaces using a huge signature to store
// that same information.
QMap<QString, protostem*> m_suffix_protostems_2;
QMap<QString, protostem*> m_prefix_protostems_2;
QMap<QString, protostem*> m_prefix_protostems_2; // temporary data structures for crab1


bool m_SuffixesFlag;
CLexicon* m_parent_lexicon;

// all of the possible continuations
// affixes that are "thrown out" in Crab2
CSignatureCollection* m_ParaSignatures; /*!< the information we have about stems which we have not yet integrated into a morphological system. */
CSuffixCollection * m_ParaSuffixes;
CStemCollection * m_ResidualStems;
CSignatureCollection * m_ResidualPrefixSignatures;
CStemCollection * m_StemsFromSubsignatures;
CSignatureCollection* m_Subsignatures;

// Finds the difference between signatures, e.g. {ed, es, er, e, ing} vs {d, s, r, NULL}
QList<simple_sig_graph_edge*> m_SigGraphEdgeList; /*!< the sig_graph_edges in here contain only one word associated with each. */
lxa_sig_graph_edge_map m_SigGraphEdgeMap; /*!< the sig_graph_edges in here contain lists of words associated with them. */
CSignatureCollection * m_PassiveSignatures; /*!< these signatures have stems one letter off from another signature. */
CSignatureCollection * m_SequentialSignatures; /*! signatures where one affix leads to another signature. */
// Generalizes repeating
QList<CHypothesis*> * m_Hypotheses;
QMap<QString, CHypothesis*> * m_Hypothesis_map;
// add component 1
Expand All @@ -176,12 +182,29 @@ class CLexicon

double m_entropy_threshold_for_stems;

// experiment for gold standard evaluation code
GoldStandard* m_goldstandard;
EvalParses* m_eval_parses;
// end of experiment

public:
CLexicon(CLexicon* parent_lexicon = NULL, bool suffix_flag = true);
public:

~CLexicon();

// experiment for gold standard evaluation code
// Returns the current gold standard, or NULL if none is loaded.
GoldStandard* get_goldstandard() { return m_goldstandard; }
// Creates a GoldStandard from file_name, stores it in m_goldstandard and returns it.
GoldStandard* new_goldstandard_from_xml(QString& file_name);
// Deletes the current gold standard (if any) and resets the pointer to NULL.
void delete_goldstandard() { delete m_goldstandard; m_goldstandard = NULL; }
// Evaluates m_Words against the gold standard; returns false if no gold
// standard is loaded or if the evaluation itself reports failure.
bool do_gs_evaluation();

// Returns the currently imported parses, or NULL if none are loaded.
EvalParses* get_eval_parses() { return m_eval_parses; }
// Creates an EvalParses from file_name, stores it in m_eval_parses and returns it.
EvalParses* new_eval_parses_from_txt(QString& file_name);
// Deletes the imported parses (if any) and resets the pointer to NULL.
void delete_eval_parses();
// Evaluates the imported parses against the gold standard; returns false if
// either is missing or if the evaluation itself reports failure.
bool do_gs_evaluation_on_eval_parses();


void dump_signatures_to_debug();
// accessors and protostems
void dump_suffixes(QList<QString>*);
Expand Down
13 changes: 12 additions & 1 deletion QtLing/QtLing.pro
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#
#-------------------------------------------------

QT += core gui
QT += core gui xml

greaterThan(QT_MAJOR_VERSION, 4): QT += widgets

Expand Down Expand Up @@ -40,6 +40,14 @@ SOURCES += main.cpp\
signature_graph.cpp \
string_group.cpp \
cparse.cpp \
lexicon_json.cpp \
evaluation.cpp \
evaluation_goldstandard.cpp \
evaluation_evalparses.cpp \
evaluation_parsemap.cpp \
mainwindow_find.cpp \
mainwindow_menubar.cpp \
mainwindow_actions.cpp \
stringalignment.cpp \
allosignatures.cpp

Expand All @@ -65,6 +73,9 @@ HEADERS += mainwindow.h \
mainwindow.h \
string_group.h \
cparse.h \
evaluation.h \
mainwindow_find.h \
mainwindow_menubar.h \
stringalignment.h \
allosignatures.h

Expand Down
Loading