前一段时间写的aurora，用来统计文本文件里单词的数量的

这是前一段时间写的 aurora，用来统计文本文件里每个单词出现的次数。

源代码如下。

//// console版

#include "cai.h"

#include <iostream>

using namespace std;

// cai.exe -text file_name -delimiter file_name -skip file_name -dup file_name -sort freq

main(int argc, char* argv[])

{

ios::sync_with_stdio(false);

error err = OK;

cmd_argument cmd;

cmd = parse_cmd_argument(argc, argv, err);

cout << "In memeory of Aurora(1984)" << endl;

cout << "Programming by [email protected]" << endl;

if (err!=OK)

{

cout << "cmd arguments parsing error." << endl;

cout << "cai.exe -text file_name -delimiter file_name -skip file_name -dup file_name -sort freq|alph" << endl;

}

input_text(cmd.text_file_name, err);

get_skip_table(cmd.skip_table_file_name, err);

get_dup_table(cmd.dup_table_file_name, err);

get_delimiter_table(cmd.delimiter_table_file_name, err);

segement();

remove_skip();

remove_dup();

output_vocabulary(cmd.sort);

}

/////

/// cai.h for console version.

/////

#include <vector>

#include <string>

#include <map>

#include <iostream>

#include <fstream>

#include <sstream>

#include <algorithm>

#include <cctype>

using namespace std;

// Values taken from the command line by parse_cmd_argument().
struct cmd_argument
{
    std::string text_file_name;             // argv[2]:  text to analyse
    std::string delimiter_table_file_name;  // argv[4]:  delimiter table file
    std::string skip_table_file_name;       // argv[6]:  skip table file
    std::string dup_table_file_name;        // argv[8]:  dup table file
    std::string sort;                       // argv[10]: "freq" selects frequency order in output_vocabulary()
};

// Shared state of the processing pipeline. Each stage reads the container
// filled by the previous stage and appends to its own.
std::vector<std::string> skip_table;           // words that remove_skip() drops
std::vector<char> delimiter_table;             // characters segement() turns into spaces
std::map<std::string, std::string> dup_table;  // variant (meaning) as key, root (mean) as value
std::string whole_text;                        // input text, lower-cased, lines joined by spaces
std::vector<std::string> text_segmented;       // filled by segement()
std::vector<std::string> text_skip_removed;    // filled by remove_skip()
std::vector<std::string> text_dup_removed;     // filled by remove_dup()
std::map<std::string, int> voc_output;         // word -> number of occurrences

// Status codes handed around through the `error&` out-parameters.
enum error
{
    OK = 0,                  // no error recorded
    cmd_pasrsing_error = 1   // set by parse_cmd_argument() on a wrong argument count
};

// Reads the option values from a command line shaped like
//   cai.exe -text file_name -delimiter file_name -skip file_name -dup file_name -sort freq|alph
//
// Only the argument count is validated. The values are picked by position
// (argv[2], [4], [6], [8], [10]); the flag names themselves are not inspected.
// With any other count, err is set to cmd_pasrsing_error and an empty
// cmd_argument is returned. err is left untouched on success.
cmd_argument parse_cmd_argument(int argc, char* argv[], error& err)
{
    cmd_argument parsed;

    if (argc == 11)
    {
        parsed.sort = argv[10];
        parsed.dup_table_file_name = argv[8];
        parsed.skip_table_file_name = argv[6];
        parsed.delimiter_table_file_name = argv[4];
        parsed.text_file_name = argv[2];
        return parsed;
    }

    err = cmd_pasrsing_error;
    return parsed;
}

void input_text(string file_name, error& err)

{

// cout << "input:" << file_name << endl << endl;

string line;

int line_count = 0;

ifstream f(file_name.c_str());

while(getline(f, line))

{

whole_text += line+" ";

++line_count;

}

string foo = "Some Mixed Case Text";

transform(whole_text.begin(), whole_text.end(), whole_text.begin(), ::tolower);

// cout << line_count << " lines" << endl;

}

void get_skip_table(string file_name, error& err)

{

// cout << "skip:" << file_name << endl << endl;

string line;

ifstream f(file_name.c_str());

while(getline(f, line))

{

skip_table.push_back(line);

}

void get_dup_table(string file_name, error& err)

{

// cout << "dup:" << file_name << endl << endl;

string line;

ifstream f(file_name.c_str());

stringstream ss;

string variant;

string root;

string word;

// cout << "---" << endl;

while(getline(f, line))

{

ss.clear();

ss << line;

for (int i = 0; ss >> word; ++i)

{

if (i==0)

{

root = word;

}

else

{

variant = word;

dup_table[variant]=root;

}

void get_delimiter_table(string file_name, error& err)

{

// cout << "delimiter:" << file_name << endl << endl;

char space;

ifstream f(file_name.c_str());

while(f >> space)

{

delimiter_table.push_back(space);

}

// whole_text is filtered with the delimiter table into text_segmented:
// every character listed in delimiter_table is replaced by a space inside
// whole_text, then each whitespace-separated word is appended to
// text_segmented.
//
// Splitting uses formatted extraction, so tabs and carriage returns separate
// words as well; splitting on ' ' alone would leave them glued to a word.
void segement()
{
    for (std::vector<char>::const_iterator d = delimiter_table.begin(); d != delimiter_table.end(); ++d)
    {
        std::replace(whole_text.begin(), whole_text.end(), *d, ' ');
    }

    std::istringstream words(whole_text);
    std::string word;
    while (words >> word)
    {
        text_segmented.push_back(word);
    }
}

// Appends to text_skip_removed every word of text_segmented that does not
// appear in skip_table. Comparison is exact string equality; word order and
// repeated words are preserved.
void remove_skip()
{
    for (std::vector<std::string>::const_iterator w = text_segmented.begin(); w != text_segmented.end(); ++w)
    {
        const bool skipped = std::find(skip_table.begin(), skip_table.end(), *w) != skip_table.end();
        if (!skipped)
        {
            text_skip_removed.push_back(*w);
        }
    }
}

// Rewrites variants to their roots in text_skip_removed (in place), then
// copies the result into text_dup_removed.
//
// dup_table entries are applied one after another in key order, each over the
// whole word list, so a root that is itself a later key is rewritten again.
void remove_dup()
{
    typedef std::map<std::string, std::string>::iterator entry_iterator;
    typedef std::vector<std::string>::iterator word_iterator;

    for (entry_iterator entry = dup_table.begin(); entry != dup_table.end(); ++entry)
    {
        for (word_iterator w = text_skip_removed.begin(); w != text_skip_removed.end(); ++w)
        {
            if (*w == entry->first)
            {
                *w = entry->second;
            }
        }
    }

    text_dup_removed = text_skip_removed;
}

// Ordering predicate for (word, count) pairs: higher counts come first.
// Equal counts are ordered alphabetically by word, so sorting with this
// predicate gives the same result on every run even though std::sort is not
// a stable sort.
bool more_freq(const std::pair<std::string, int>& a, const std::pair<std::string, int>& b)
{
    if (a.second != b.second)
    {
        return a.second > b.second;
    }
    return a.first < b.first;
}

void output_vocabulary(string sorting)

{

for (vector<string>::iterator i=text_dup_removed.begin(); i != text_dup_removed.end(); ++i)

{

voc_output[*i]++;

}

if (sorting=="freq")

{

vector<pair<string, int> > voc; //(voc_output.begin(), voc_output.end());

for (map<string,int>::iterator i=voc_output.begin();

i != voc_output.end();

++i)

{

voc.push_back(*i);

}

sort(voc.begin(), voc.end(), more_freq);

cout << "-------------------" << endl;

cout << "vocabularytcount sorted by freq" << endl;

for (vector<pair<string, int> >::iterator i = voc.begin(); i != voc.end(); ++i)

{

cout << i->first << "tt" << i->second << endl;

}

else

{

cout << "-------------------" << endl;

cout << "vocabularytcount sorted by alph" << endl;

for (map<string,int>::iterator i=voc_output.begin(); i != voc_output.end(); ++i)

{

cout << i->first << "tt" << i->second << endl;

}

//// GUI version

///auroraDlg.cpp

//////////////

void CAuroraDlg::OnOK()

{

const char pszFilter[] = _T("TXT File (*.txt)|*.txt|All Files (*.*)|*.*||");

CFileDialog dlg(TRUE, NULL, NULL, OFN_HIDEREADONLY | OFN_OVERWRITEPROMPT, pszFilter, this);

CString strFilePath;

if(dlg.DoModal() == IDOK)

{

strFilePath = dlg.GetPathName();

}

char file_name[1024];

strcpy(file_name, (LPCTSTR)strFilePath );

error err = OK;

m_text_out = "reading...";

UpdateData(FALSE);

input_text(file_name, err);

m_text_out = "reading...done";

UpdateData(FALSE);

get_skip_table("skip.txt", err);

get_dup_table("dup.txt", err);

get_delimiter_table("space.txt", err);

m_text_out = "segmenting the text...";

UpdateData(FALSE);

segement();

m_text_out = "segmenting the text...done";

UpdateData(FALSE);

m_text_out = "removing spaces...";

UpdateData(FALSE);

remove_skip();

m_text_out = "removing spaces...done";

UpdateData(FALSE);

m_text_out = "merging the words duplicated...";

UpdateData(FALSE);

remove_dup();

m_text_out = "merging the words duplicated...done";

UpdateData(FALSE);

m_text_out = "sorting...";

UpdateData(FALSE);

output_vocabulary("freq");

m_text_out = "sorting...done";

UpdateData(FALSE);

m_text_out = text_out.c_str();

UpdateData(FALSE);

}

//// cai.h for the GUI version

#include <vector>

#include <string>

#include <map>

#include <iostream>

#include <fstream>

#include <sstream>

#include <algorithm>

#include <cctype>

using namespace std;

// Values taken from the command line by parse_cmd_argument().
struct cmd_argument
{
    std::string text_file_name;             // argv[2]:  text to analyse
    std::string delimiter_table_file_name;  // argv[4]:  delimiter table file
    std::string skip_table_file_name;       // argv[6]:  skip table file
    std::string dup_table_file_name;        // argv[8]:  dup table file
    std::string sort;                       // argv[10]: "freq" selects frequency order in output_vocabulary()
};

// Shared state of the processing pipeline. Each stage reads the container
// filled by the previous stage and appends to its own.
std::vector<std::string> skip_table;           // words that remove_skip() drops
std::vector<char> delimiter_table;             // characters segement() turns into spaces
std::map<std::string, std::string> dup_table;  // variant (meaning) as key, root (mean) as value
std::string whole_text;                        // input text, lower-cased, lines joined by spaces
std::vector<std::string> text_segmented;       // filled by segement()
std::vector<std::string> text_skip_removed;    // filled by remove_skip()
std::vector<std::string> text_dup_removed;     // filled by remove_dup()
std::map<std::string, int> voc_output;         // word -> number of occurrences
std::string text_out;                          // formatted report written by output_vocabulary()

// Status codes handed around through the `error&` out-parameters.
enum error
{
    OK = 0,                  // no error recorded
    cmd_pasrsing_error = 1   // set by parse_cmd_argument() on a wrong argument count
};

// Reads the option values from a command line shaped like
//   cai.exe -text file_name -delimiter file_name -skip file_name -dup file_name -sort freq|alph
//
// Only the argument count is validated. The values are picked by position
// (argv[2], [4], [6], [8], [10]); the flag names themselves are not inspected.
// With any other count, err is set to cmd_pasrsing_error and an empty
// cmd_argument is returned. err is left untouched on success.
cmd_argument parse_cmd_argument(int argc, char* argv[], error& err)
{
    cmd_argument parsed;

    if (argc == 11)
    {
        parsed.sort = argv[10];
        parsed.dup_table_file_name = argv[8];
        parsed.skip_table_file_name = argv[6];
        parsed.delimiter_table_file_name = argv[4];
        parsed.text_file_name = argv[2];
        return parsed;
    }

    err = cmd_pasrsing_error;
    return parsed;
}

void input_text(string file_name, error& err)

{

// cout << "input:" << file_name << endl << endl;

string line;

int line_count = 0;

ifstream f(file_name.c_str());

while(getline(f, line))

{

whole_text += line+" ";

++line_count;

}

string foo = "Some Mixed Case Text";

transform(whole_text.begin(), whole_text.end(), whole_text.begin(), ::tolower);

// cout << line_count << " lines" << endl;

}

void get_skip_table(string file_name, error& err)

{

// cout << "skip:" << file_name << endl << endl;

string line;

ifstream f(file_name.c_str());

while(getline(f, line))

{

skip_table.push_back(line);

}

void get_dup_table(string file_name, error& err)

{

// cout << "dup:" << file_name << endl << endl;

string line;

ifstream f(file_name.c_str());

stringstream ss;

string variant;

string root;

string word;

// cout << "---" << endl;

while(getline(f, line))

{

ss.clear();

ss << line;

for (int i = 0; ss >> word; ++i)

{

if (i==0)

{

root = word;

}

else

{

variant = word;

dup_table[variant]=root;

}

void get_delimiter_table(string file_name, error& err)

{

// cout << "delimiter:" << file_name << endl << endl;

char space;

ifstream f(file_name.c_str());

while(f >> space)

{

delimiter_table.push_back(space);

}

// whole_text is filtered with the delimiter table into text_segmented:
// every character listed in delimiter_table is replaced by a space inside
// whole_text, then each whitespace-separated word is appended to
// text_segmented.
//
// Splitting uses formatted extraction, so tabs and carriage returns separate
// words as well; splitting on ' ' alone would leave them glued to a word.
void segement()
{
    for (std::vector<char>::const_iterator d = delimiter_table.begin(); d != delimiter_table.end(); ++d)
    {
        std::replace(whole_text.begin(), whole_text.end(), *d, ' ');
    }

    std::istringstream words(whole_text);
    std::string word;
    while (words >> word)
    {
        text_segmented.push_back(word);
    }
}

// Appends to text_skip_removed every word of text_segmented that does not
// appear in skip_table. Comparison is exact string equality; word order and
// repeated words are preserved.
void remove_skip()
{
    for (std::vector<std::string>::const_iterator w = text_segmented.begin(); w != text_segmented.end(); ++w)
    {
        const bool skipped = std::find(skip_table.begin(), skip_table.end(), *w) != skip_table.end();
        if (!skipped)
        {
            text_skip_removed.push_back(*w);
        }
    }
}

// Rewrites variants to their roots in text_skip_removed (in place), then
// copies the result into text_dup_removed.
//
// dup_table entries are applied one after another in key order, each over the
// whole word list, so a root that is itself a later key is rewritten again.
void remove_dup()
{
    typedef std::map<std::string, std::string>::iterator entry_iterator;
    typedef std::vector<std::string>::iterator word_iterator;

    for (entry_iterator entry = dup_table.begin(); entry != dup_table.end(); ++entry)
    {
        for (word_iterator w = text_skip_removed.begin(); w != text_skip_removed.end(); ++w)
        {
            if (*w == entry->first)
            {
                *w = entry->second;
            }
        }
    }

    text_dup_removed = text_skip_removed;
}

// Ordering predicate for (word, count) pairs: higher counts come first.
// Equal counts are ordered alphabetically by word, so sorting with this
// predicate gives the same result on every run even though std::sort is not
// a stable sort.
bool more_freq(const std::pair<std::string, int>& a, const std::pair<std::string, int>& b)
{
    if (a.second != b.second)
    {
        return a.second > b.second;
    }
    return a.first < b.first;
}

void output_vocabulary(string sorting)

{

for (vector<string>::iterator i=text_dup_removed.begin(); i != text_dup_removed.end(); ++i)

{

voc_output[*i]++;

}

if (sorting=="freq")

{

vector<pair<string, int> > voc; //(voc_output.begin(), voc_output.end());

for (map<string,int>::iterator i=voc_output.begin();

i != voc_output.end();

++i)

{

voc.push_back(*i);

}

sort(voc.begin(), voc.end(), more_freq);

// cout << "-------------------" << endl;

// cout << "vocabularytcount sorted by freq" << endl;

text_out = "vocabularytcount sorted by freqrn";

vector<pair<string, int> >::iterator j;

for (j = voc.begin(); j != voc.end(); ++j)

{

//cout << j->first << "tt" << j->second << endl;

stringstream ss;

string num;

ss << j->second;

ss >> num;

text_out += j->first;

text_out += "tt";

text_out += num;

text_out += "rn";

}

else

{

// cout << "-------------------" << endl;

// cout << "vocabularytcount sorted by alph" << endl;

text_out = "vocabularytcount sorted by alphr";

map<string,int>::iterator k;

for (k=voc_output.begin(); k != voc_output.end(); ++k)

{

//cout << k->first << "tt" << k->second << endl;

stringstream ss;

string num;

ss << k->second;

ss >> num;

text_out += k->first;

text_out += "tt";

text_out += num;

text_out += "rn";

}

/////////////////////////

///////// diff console GUI

$ diff cai.h GUI/aurora/cai.h

29a30

> string text_out;

249,253c250,264

< cout << "-------------------" << endl;

< cout << "vocabularytcount sorted by freq" << endl;

< for (vector<pair<string, int> >::iterator i = voc.begin(); i != voc.end(); ++i)

< {

< cout << i->first << "tt" << i->second << endl;

---

> // cout << "-------------------" << endl;

> // cout << "vocabularytcount sorted by freq" << endl;

> text_out = "vocabularytcount sorted by freqrn";

> vector<pair<string, int> >::iterator j;

> for (j = voc.begin(); j != voc.end(); ++j)

> {

> //cout << j->first << "tt" << j->second << endl;

> stringstream ss;

> string num;

> ss << j->second;

> ss >> num;

> text_out += j->first;

> text_out += "tt";

> text_out += num;

> text_out += "rn";

258,262c269,283

< cout << "-------------------" << endl;

< cout << "vocabularytcount sorted by alph" << endl;

< for (map<string,int>::iterator i=voc_output.begin(); i != voc_output.end(); ++i)

< {

< cout << i->first << "tt" << i->second << endl;

---

> // cout << "-------------------" << endl;

> // cout << "vocabularytcount sorted by alph" << endl;

> text_out = "vocabularytcount sorted by alphr";

> map<string,int>::iterator k;

> for (k=voc_output.begin(); k != voc_output.end(); ++k)

> {

> //cout << k->first << "tt" << k->second << endl;

> stringstream ss;

> string num;

> ss << k->second;

> ss >> num;

> text_out += k->first;

> text_out += "tt";

> text_out += num;

> text_out += "rn";

前一段时间写的aurora，用来统计文本文件里单词的数量的

One thought on “前一段时间写的aurora，用来统计文本文件里单词的数量的”

Leave a Reply Cancel reply