Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Search and Evaluation #7

Open
wants to merge 30 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
b78d308
moving goldstandard class to this branch
hansonhl Jun 18, 2018
753a07e
implementing menu andn toolbar option for evaluation
hansonhl Jun 18, 2018
4993ddf
removing temporary files
hansonhl Jun 18, 2018
7fd6bd8
created new MainMenu class and transferred code from mainwindow, made…
hansonhl Jun 18, 2018
01b16bb
adding comments to undertand Crab1 better
hansonhl Jun 19, 2018
bc04454
Tested and fixed bugs for gold standard evaluation
hansonhl Jun 19, 2018
f15a924
moved openXML functionality from GoldStandard class into mainwindow
hansonhl Jun 19, 2018
9f5d696
added treeView item for goldstandard, revised algorithm to include ov…
hansonhl Jun 20, 2018
0631f61
adding .pro.user
hansonhl Jun 21, 2018
5bcc682
Merge remote-tracking branch 'upstream/master' into gold-standard
hansonhl Jun 21, 2018
663da63
finishing model and table view of gold standard
hansonhl Jun 21, 2018
adcfabd
Merge remote-tracking branch 'upstream/master' into gold-standard
hansonhl Jun 22, 2018
b576132
starting on json output
hansonhl Jun 25, 2018
72445ca
finishing Morfessor parse file input, starting modification of tree view
hansonhl Jun 26, 2018
c8e6eae
adding handler class for old GSMap structure
hansonhl Jun 26, 2018
d07c5bc
starting on parsemap class
hansonhl Jun 26, 2018
7e87cfe
finishing table view of Morfessor parse evaluation
hansonhl Jun 26, 2018
94bf791
finishing eval parses object
hansonhl Jun 26, 2018
b7cb088
committing pro.user
hansonhl Jun 26, 2018
662687a
Merge branch 'gold-standard' into parsemap
hansonhl Jun 26, 2018
ad83334
cleaning evaluation code
hansonhl Jun 28, 2018
a6b24ba
finishing gui and signal connections for find functionality
hansonhl Jun 28, 2018
c431eeb
finishing dock widget for search
hansonhl Jul 5, 2018
a316d00
cleaning up code and correcting bugs in search function
hansonhl Jul 5, 2018
381db75
finishing search
hansonhl Jul 6, 2018
bfd52c9
Merge remote-tracking branch 'upstream/master' into search
hansonhl Jul 6, 2018
19a27df
adding documentation comments
hansonhl Jul 6, 2018
4b9f280
adding documentation comments
hansonhl Jul 6, 2018
522890b
supporting reading in Morfessor Parse txt with /STM or /PRE at end of…
hansonhl Jul 6, 2018
3cccfec
Merge branch 'master' into search
JohnAGoldsmith Jul 19, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
131 changes: 124 additions & 7 deletions QtLing/Lexicon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include "SuffixCollection.h"
#include "WordCollection.h"
#include "Word.h"
#include "evaluation.h"
#include "cparse.h"

void SortQStringListFromRight(QStringList& ThisStringList);
Expand Down Expand Up @@ -63,6 +64,8 @@ CLexicon::CLexicon( CLexicon* lexicon, bool suffix_flag)
m_Hypothesis_map = new QMap<QString, CHypothesis*>;
m_entropy_threshold_for_stems = 1.2;
m_parent_lexicon = lexicon;
m_goldstandard = NULL;
m_eval_parses = NULL;

m_category_types["Words"] = CT_word;
m_category_types["Suffixal stems"] = CT_stem;
Expand Down Expand Up @@ -100,6 +103,7 @@ CLexicon::~CLexicon()
delete m_PrefixSignatures;
delete m_ParaSignatures;
delete m_PassiveSignatures;
delete m_goldstandard;
delete m_Parses;
}

Expand Down Expand Up @@ -143,10 +147,54 @@ void CLexicon::clear_lexicon(){
m_Hypotheses = new QList<CHypothesis*>;


}

// Gold standard support.
/*!
 * Creates a GoldStandard object from the given file name, stores it in
 * m_goldstandard and returns the stored pointer.
 *
 * Any gold standard that was previously held by this lexicon is deleted, so
 * calling this function repeatedly does not leak. Pointers obtained earlier
 * through get_goldstandard() are invalidated by this call.
 *
 * \param file_name  name of the file handed to the GoldStandard constructor.
 * \return the newly created GoldStandard; it remains owned by the lexicon.
 */
GoldStandard* CLexicon::new_goldstandard_from_xml(QString& file_name)
{
    // Build the replacement first: if construction throws, the old object
    // is still in place and nothing is leaked or left dangling.
    GoldStandard* replacement = new GoldStandard(file_name);

    delete m_goldstandard;      // deleting a null pointer is a no-op
    m_goldstandard = replacement;
    return m_goldstandard;
}

/*!
 * Evaluates the word collection of this lexicon (m_Words) against the
 * currently loaded gold standard.
 *
 * \return false when no gold standard has been loaded or when
 *         GoldStandard::evaluate reports failure; true otherwise.
 */
bool CLexicon::do_gs_evaluation()
{
    // Nothing to evaluate against.
    if (!m_goldstandard) {
        qDebug() << 134 << "Lexicon.cpp: evaluation failed: GoldStandard not loaded";
        return false;
    }

    const bool ok = m_goldstandard->evaluate(m_Words);
    if (!ok) {
        return false;
    }

    qDebug() << 139 << "Lexicon.cpp: evaluation completed";
    return true;
}

/*!
 * Creates an EvalParses object from the given file name, stores it in
 * m_eval_parses and returns the stored pointer.
 *
 * Any EvalParses object previously held by this lexicon is deleted, so
 * calling this function repeatedly does not leak. Pointers obtained earlier
 * through get_eval_parses() are invalidated by this call.
 *
 * \param file_name  name of the file handed to the EvalParses constructor.
 * \return the newly created EvalParses; it remains owned by the lexicon.
 */
EvalParses* CLexicon::new_eval_parses_from_txt(QString& file_name)
{
    // Build the replacement first so that a throwing constructor leaves the
    // existing object untouched.
    EvalParses* replacement = new EvalParses(file_name);

    delete m_eval_parses;       // deleting a null pointer is a no-op
    m_eval_parses = replacement;
    return m_eval_parses;
}

/*!
 * Destroys the EvalParses object held by this lexicon, if any, and resets
 * m_eval_parses to null so that later null checks see "not loaded".
 */
void CLexicon::delete_eval_parses()
{
    delete m_eval_parses;
    m_eval_parses = nullptr;
}


/*!
 * Evaluates the imported parses (m_eval_parses) against the currently
 * loaded gold standard.
 *
 * \return false when either the gold standard or the imported parses are
 *         missing, or when GoldStandard::evaluate reports failure;
 *         true otherwise.
 */
bool CLexicon::do_gs_evaluation_on_eval_parses()
{
    // Both inputs are required before an evaluation can be attempted.
    if (m_goldstandard == nullptr || m_eval_parses == nullptr) {
        qDebug() << 153 << "Lexicon.cpp: evaluation failed: GoldStandard or evaluation file not loaded";
        return false;
    }

    const bool evaluation_succeeded = m_goldstandard->evaluate(m_eval_parses);
    if (!evaluation_succeeded) {
        return false;
    }

    qDebug() << 158 << "Lexicon.cpp: evaluation on imported parses completed";
    return true;
}

bool CLexicon::stem_autobiographies_contains(QString stem) {
//qDebug() << 121 << stem << m_stem_autobiographies.contains(stem);
Expand Down Expand Up @@ -225,6 +273,12 @@ void CLexicon::dump_signatures_to_debug()

}

/* Crab_1:
* Used after MainWindow::read_dx1_file, which parses the dx1 file and
* stores words and their counts into CWordCollection object in Lexicon,
* and generates the SortedStringArrays.
*
*/
void CLexicon::Crab_1()
{
step1_from_words_to_protostems();
Expand Down Expand Up @@ -259,6 +313,11 @@ void CLexicon::Crab_1()
* This is the first of the three initial parts of finding signatures.
* This makes a cut at every point in a word where the successor frequency
* is greater than 1.

* Taking the sorted string array as input, finds protoroots and stores them
* by modifying m_suffix_protostems_2 (for suffixes)
* or m_prefix_protostems_2 (for prefixes)

* It is divided into two parts; the first finds protostems, by detecting
* successor frequency greater than 1; the second breaks a word after a protostem.
*/
Expand Down Expand Up @@ -402,7 +461,9 @@ void CLexicon::step1_from_words_to_protostems()

/*!
* This is the second of the three initial parts of finding signatures.
* This creates stem/affix pairs, which are put in a long list of "Parses".
* This creates stem/affix pairs, which are put in a long list of "Parses":
* QList<QPair<QString,QString>>* m_Parses
*
*/
void CLexicon::step2_from_protostems_to_parses()
{
Expand Down Expand Up @@ -487,9 +548,14 @@ void CLexicon::step3_from_parses_to_stem_to_sig_maps(QString name_of_calling_f
{ // const int MINIMUM_NUMBER_OF_STEMS = 2;

QString this_stem_t, this_suffix, this_prefix, this_affix_t, this_signature_string, this_word;
stem_list * p_this_stem_list;
//affix_set * this_ptr_to_affix_set(NULL);
map_sigstring_to_suffix_set temp_stems_to_affix_set;
// Equivalent to QMap<QString, QList<QString>>
map_sigstring_to_stem_list temp_signatures_to_stems;
// Equivalent to QSet<QString>
Stem_to_sig_map these_stem_to_sig_maps;
m_intermediate_signature_to_stems_map.clear(); //replaces "this_signature_to_stems_map"

morph_set * pSet;
CWord* pWord;

Expand All @@ -513,6 +579,31 @@ void CLexicon::step3_from_parses_to_stem_to_sig_maps(QString name_of_calling_f
m_ProgressBar->setMaximum(these_stem_to_sig_maps.count());

int count= 0;

// following lines should be removed, no? JG

//



if (false) {
// equivalent to QMap<QString, QSet<String>*>::iterator
QMapIterator<QString, morph_set*> stem_iter(temp_stems_to_affix_set); // part 1
while (stem_iter.hasNext()) // make a presignature for each stem.
{ qApp->processEvents();
count ++;
m_ProgressBar->setValue(count);
stem_iter.next();
this_stem_t = stem_iter.key();
this_signature_string = convert_set_to_qstring (stem_iter.value());
if ( ! temp_signatures_to_stems.contains(this_signature_string)){
stem_list * pStemSet = new stem_list;
temp_signatures_to_stems[this_signature_string] = pStemSet;
}
temp_signatures_to_stems.value(this_signature_string)->append(this_stem_t);
}
} // end of if false;

foreach(QString this_stem_t, these_stem_to_sig_maps.keys()){
count++; m_ProgressBar->setValue(count);

Expand Down Expand Up @@ -621,7 +712,9 @@ void CLexicon::step4_create_signatures(QString name_of_calling_function)
affix_list this_affix_list = this_signature_string.split("=");
if (this_stem_set->size() >= MINIMUM_NUMBER_OF_STEMS)
{
// put signature strings into m_Signatures
if( m_SuffixesFlag) {
// CSignature* pSig;
pSig = *m_Signatures << this_signature_string;
} else {
pSig = *m_PrefixSignatures << this_signature_string;
Expand All @@ -630,6 +723,26 @@ void CLexicon::step4_create_signatures(QString name_of_calling_function)
foreach (QString this_affix_t, this_affix_list){
step4a_link_signature_and_affix(pSig,this_affix_t);
}
if(false) {
// for each stem in the list of stems in the map of signatures,
// use the function link_signature_and_stem()
stem_list_iterator stem_iter(*p_this_stem_list);
while (stem_iter.hasNext()){
this_stem_t = stem_iter.next();
link_signature_and_stem(this_stem_t, pSig, this_signature_string);
// I think that all of the work of the next loop is already done inside "link_signature_and_stem";
if (false) {
affix_iter_2.toFront();
while(affix_iter_2.hasNext()){
this_affix_t = affix_iter_2.next();
if (this_affix_t == "NULL"){ this_affix_t = "";}
if (m_SuffixesFlag){ this_word = this_stem_t + this_affix_t;}
else { this_word = this_affix_t + this_stem_t;}
pWord = m_Words->find_or_fail(this_word);
pWord->add_to_autobiography(name_of_calling_function + "=" + this_stem_t );
}
}
} //end of if false
foreach (this_stem_t, *this_stem_set){
step4b_link_signature_and_stem_and_word(this_stem_t,pSig, this_signature_string);
// qDebug() << 679 << this_signature_string;
Expand Down Expand Up @@ -675,6 +788,9 @@ void CLexicon::step4a_link_signature_and_affix(CSignature * pSig, affix_t this_a
}
void CLexicon::step4b_link_signature_and_stem_and_word(stem_t this_stem_t , CSignature* pSig, QString this_signature_string)
{
// add a CStem object into m_suffixal_stems/m_prefixal_stems:
// - add CSignature pointer to that CStem object
// - add CStem pointer to CSignature object
CStem* pStem;
QString this_affix, this_word;
m_SuffixesFlag ?
Expand All @@ -697,7 +813,11 @@ void CLexicon::step4b_link_signature_and_stem_and_word(stem_t this_stem_t , CSig
}
CWord* pWord = m_Words->get_word(this_word);
if (!pWord){

qDebug() << this_word << "Error: this_word not found among words. Line 505" << this_stem_t << this_affix << pSig->get_key() << this_signature_string;

qDebug() << this_word << "Error: this_word not found among words. Line 577" << this_stem_t << this_affix << pSig->get_key() << this_signature_string;

} else{
stem_count += pWord->get_word_count();
pWord->add_parse_triple(this_stem_t, this_affix, pSig->get_key());
Expand All @@ -709,6 +829,8 @@ void CLexicon::step4b_link_signature_and_stem_and_word(stem_t this_stem_t , CSig
pStem->set_count(stem_count);

}


bool contains(QList<QString> * list2, QList<QString> * list1){
for (int i=0; i < list1->length();i++){
bool success = false;
Expand Down Expand Up @@ -994,8 +1116,3 @@ void CLexicon::collect_parasuffixes()
}
m_ParaSuffixes->sort_by_count();
};





34 changes: 31 additions & 3 deletions QtLing/Lexicon.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
#include <QStringList>
#include "SignatureCollection.h"
#include "Typedefs.h"
#include "evaluation.h"


class MainWindow;
class CWordCollection;
Expand All @@ -19,7 +21,6 @@ class CSuffixCollection;
class CPrefixCollection;
class QProgressBar;
class CHypothesis;

class CParse;


Expand Down Expand Up @@ -166,9 +167,9 @@ class CLexicon
CPrefixCollection * m_Prefixes;
CSignatureCollection * m_Signatures;
CSignatureCollection * m_PrefixSignatures;
CWordCollection * m_Compounds;
CWordCollection * m_Compounds; // nothing done yet
//QList<QPair<QString,QString>> * m_Parses;
QList<CParse*> * m_Parses;
QList<CParse*> * m_Parses; //

// QMap<QString,int> m_Parse_map;
QMap<QString, protostem*> m_suffix_protostems;
Expand All @@ -177,23 +178,33 @@ class CLexicon
// m_protostems_2 is used in order to keep track of exactly which interval of words in the word list begins
// with a particular proto-stem (i.e., a word-beginning). This replaces using a huge signature to store
// that same information.

//QMap<QString, protostem*> m_suffix_protostems_2;
//QMap<QString, protostem*> m_prefix_protostems_2; // temporary data structures for crab1

//QMap<QString, protostem*> m_suffix_protostems_2;
//QMap<QString, protostem*> m_prefix_protostems_2;



bool m_SuffixesFlag;
CLexicon* m_parent_lexicon;

// all of the possible continuations
// affixes that are "thrown out" in Crab2
CSignatureCollection* m_ParaSignatures; /*!< the information we have about stems which we have not yet integrated into a morphological system. */
CSuffixCollection * m_ParaSuffixes;
CStemCollection * m_ResidualStems;
CSignatureCollection * m_ResidualPrefixSignatures;
CStemCollection * m_StemsFromSubsignatures;
CSignatureCollection* m_Subsignatures;

// Finds the difference between signatures, e.g. {ed, es, er, e, ing} vs {d, s, r, NULL}
QList<simple_sig_graph_edge*> m_SigGraphEdgeList; /*!< the sig_graph_edges in here contain only one word associated with each. */
lxa_sig_graph_edge_map m_SigGraphEdgeMap; /*!< the sig_graph_edges in here contain lists of words associated with them. */
CSignatureCollection * m_PassiveSignatures; /*!< these signatures have stems one letter off from another signature. */
CSignatureCollection * m_SequentialSignatures; /*! signatures where one affix leads to another signature. */
// Generalizes repeating
QList<CHypothesis*> * m_Hypotheses;
QMap<QString, CHypothesis*> * m_Hypothesis_map;
// add component 1
Expand All @@ -207,12 +218,29 @@ class CLexicon

double m_entropy_threshold_for_stems;

// experiment for gold standard evaluation code
GoldStandard* m_goldstandard;
EvalParses* m_eval_parses;
// end of experiment

public:
CLexicon(CLexicon* parent_lexicon = NULL, bool suffix_flag = true);
public:

~CLexicon();

// experiment for gold standard evaluation code
// The lexicon holds at most one GoldStandard (m_goldstandard) and one
// EvalParses (m_eval_parses); both pointers are set to NULL by the
// CLexicon constructor.

// Returns the stored gold standard pointer (NULL if none has been created).
GoldStandard* get_goldstandard() { return m_goldstandard; }
// Allocates a GoldStandard from file_name, stores it in m_goldstandard and
// returns the stored pointer.
GoldStandard* new_goldstandard_from_xml(QString& file_name);
// Deletes the stored gold standard and resets the pointer to NULL.
void delete_goldstandard() { delete m_goldstandard; m_goldstandard = NULL; }
// Evaluates m_Words against the stored gold standard; returns false if no
// gold standard is loaded or the evaluation reports failure.
bool do_gs_evaluation();

// Returns the stored EvalParses pointer (NULL if none has been created).
EvalParses* get_eval_parses() { return m_eval_parses; }
// Allocates an EvalParses from file_name, stores it in m_eval_parses and
// returns the stored pointer.
EvalParses* new_eval_parses_from_txt(QString& file_name);
// Deletes the stored EvalParses and resets the pointer to NULL.
// NOTE(review): the visible part of ~CLexicon deletes m_goldstandard but not
// m_eval_parses -- confirm whether callers are expected to call this first.
void delete_eval_parses();
// Evaluates the stored EvalParses against the stored gold standard; returns
// false if either is missing or the evaluation reports failure.
bool do_gs_evaluation_on_eval_parses();


void dump_signatures_to_debug();
// accessors and protostems
void dump_suffixes(QList<QString>*);
Expand Down
13 changes: 12 additions & 1 deletion QtLing/QtLing.pro
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#
#-------------------------------------------------

QT += core gui
QT += core gui xml

greaterThan(QT_MAJOR_VERSION, 4): QT += widgets

Expand Down Expand Up @@ -40,6 +40,14 @@ SOURCES += main.cpp\
signature_graph.cpp \
string_group.cpp \
cparse.cpp \
lexicon_json.cpp \
evaluation.cpp \
evaluation_goldstandard.cpp \
evaluation_evalparses.cpp \
evaluation_parsemap.cpp \
mainwindow_find.cpp \
mainwindow_menubar.cpp \
mainwindow_actions.cpp \
stringalignment.cpp \
allosignatures.cpp

Expand All @@ -65,6 +73,9 @@ HEADERS += mainwindow.h \
mainwindow.h \
string_group.h \
cparse.h \
evaluation.h \
mainwindow_find.h \
mainwindow_menubar.h \
stringalignment.h \
allosignatures.h

Expand Down
Loading