ICode9

精准搜索请尝试: 精确搜索
首页 > 其他分享> 文章详细

基于COCA词频表的文本词汇分布测试工具v0.2

2020-07-04 16:38:11  阅读:211  来源: 互联网

标签:HEAD word COCA wfrq 词频 WORDS 测试工具 WORD define


update:

  • 简单整理了一下代码的组织。
  • 处理的单词封装成类,单词的修正,信息的显示都作为其内的方法。

 

代码写得还比较粗糙:工具本身还可以进一步封装,对单词变形(复数、时态等)也基本没有处理,以后有时间再改进。

项目已托管到 GitHub:https://github.com/MorpheusDong/TextVocabularyAnalyzer

 

TypeDefine.h

#ifndef _TYPE_DEFINE_H_
#define _TYPE_DEFINE_H_

#include <algorithm>
#include <array>
#include <cctype>
#include <fstream>
#include <iostream>
#include <iterator>
#include <map>
#include <set>
#include <string>
#include <vector>

using namespace std;

// Upper bound on the number of "frequency word" entries main() reads from the COCA list.
#define COCA_WORDS_NUM                       20201U
// Number of per-initial-letter buckets the COCA word map is split into (one per letter a-z).
#define WORDS_HEAD_NUM                       26U
                                             
// Bucket index of each initial letter; the value equals the letter's position in alphabet_str.
#define WORDS_HEAD_A                         0U
#define WORDS_HEAD_B                         1U
#define WORDS_HEAD_C                         2U
#define WORDS_HEAD_D                         3U
#define WORDS_HEAD_E                         4U
#define WORDS_HEAD_F                         5U
#define WORDS_HEAD_G                         6U
#define WORDS_HEAD_H                         7U
#define WORDS_HEAD_I                         8U
#define WORDS_HEAD_J                         9U
#define WORDS_HEAD_K                         10U
#define WORDS_HEAD_L                         11U
#define WORDS_HEAD_M                         12U
#define WORDS_HEAD_N                         13U
#define WORDS_HEAD_O                         14U
#define WORDS_HEAD_P                         15U
#define WORDS_HEAD_Q                         16U
#define WORDS_HEAD_R                         17U
#define WORDS_HEAD_S                         18U
#define WORDS_HEAD_T                         19U
#define WORDS_HEAD_U                         20U
#define WORDS_HEAD_V                         21U
#define WORDS_HEAD_W                         22U
#define WORDS_HEAD_X                         23U
#define WORDS_HEAD_Y                         24U
#define WORDS_HEAD_Z                         25U
                                             
// Number of entries in usual_w_out_of_COCA_str.
#define USUAL_WORD_NUM                       17U


// Reporting bands for a word's COCA frequency rank.
// The enumerator values double as indices into the statistics array and into report_str,
// so the order here must match the order of the labels in report_str.
typedef enum WordFrequencyType
{
    WORD_UNDER_4000 = 0,     // rank 1..4000
    WORD_4000_6000,          // rank 4001..6000
    WORD_6000_8000,          // rank 6001..8000
    WORD_8000_10000,         // rank 8001..10000
    WORD_10000_12000,        // rank 10001..12000
    WORD_12000_14000,        // rank 12001..14000
    WORD_14000_16000,        // rank 14001..16000
    WORD_OVER_16000,         // every other non-zero rank
    WORD_NOT_FOUND_COCA,     // rank 0: the word has no entry in the COCA list
    WORD_LEVEL_NUM           // number of bands above; used as an array size, not a real band
}TagWordFrequencyType;

// Lower-case alphabet; the character at index N is the initial letter of bucket N (see WORDS_HEAD_*).
const string alphabet_str = "abcdefghijklmnopqrstuvwxyz";

// Labels printed in the final report, one per frequency band.
// Indexed by the TagWordFrequencyType value of the band.
const string report_str[WORD_LEVEL_NUM] = {
    "UNDER 4000: ",
    "4000-6000: ",
    "6000-8000: ",
    "8000-10000: ",
    "10000-12000: ",
    "12000-14000: ",
    "14000-16000: ",
    "16000-20000+: ",
    "\nNot found in COCA:"
};

// Common words that have no entry of their own in the COCA list.
// CLetters::usual_recheck() tests a word against this table; main() counts a match as WORD_UNDER_4000.
// Entries such as "s", "re" and "t" are presumably the fragments left when the tokenizer
// splits contractions at the apostrophe - TODO confirm.
const string usual_w_out_of_COCA_str[USUAL_WORD_NUM] =
{
    "s","is","are","re","was","were",
    "an","won","t","has","had","been",
    "did","does","cannot","got","men"
};


#endif

 

TextVocabularyAnalyzer.h

#ifndef _TEXT_VOCABULARY_ANALYZER_H_
#define _TEXT_VOCABULARY_ANALYZER_H_

#include "TypeDefine.h"

// Maps a COCA frequency rank to its reporting band; a rank of 0 means "not found".
extern TagWordFrequencyType frequency_classify(const int wfrq);
// Adds one to the counter of wfrq_array that corresponds to wfrq_tag.
extern void word_frequency_analyze(array<int, WORD_LEVEL_NUM>& wfrq_array, TagWordFrequencyType wfrq_tag);
// Returns true when c is one of 'a'..'z' or 'A'..'Z'.
extern bool isaletter(const char& c);

// Holds one word taken from the analyzed text, stored in lower case, together with
// the helpers used to display it and to retry a failed COCA lookup.
class CLetters
{
private:
    string m_word;    // current word, lower-cased by fill()

public:
    CLetters();                    // starts with an empty word
    ~CLetters();
    void fill(vector<char>& vw);   // replaces the word with the characters of vw, lower-cased
    const string word();           // returns a copy of the current word
    const char firstletter();      // returns the first character of the current word
    void processing();             // prints a "Finding word ..." progress message to cout
    bool usual_recheck();          // true when the word is listed in usual_w_out_of_COCA_str
    bool form_recheck();           // strips a trailing "s"/"ed"/"ing" in place; true when something was stripped
};



#endif // !_TEXT_VOCABULARY_ANALYZER_H_

 

TextVocabularyAnalyzer.cpp

/* TextVocabularyAnalyzer.cpp */

#include <algorithm>
#include "TextVocabularyAnalyzer.h"

// Maps a COCA frequency rank to its reporting band.
//   wfrq == 0          -> WORD_NOT_FOUND_COCA (0 is the "no entry" marker)
//   1 .. 16000         -> the matching 4000/2000-wide band
//   anything else      -> WORD_OVER_16000 (this includes negative values)
TagWordFrequencyType frequency_classify(const int wfrq)
{
    if (wfrq == 0)
        return WORD_NOT_FOUND_COCA;

    // Values outside 1..16000 all land in the last ranked band.
    if (wfrq < 0 || wfrq > 16000)
        return WORD_OVER_16000;

    // From here on wfrq is in 1..16000; walk the upper bounds in ascending order.
    if (wfrq <= 4000)
        return WORD_UNDER_4000;
    if (wfrq <= 6000)
        return WORD_4000_6000;
    if (wfrq <= 8000)
        return WORD_6000_8000;
    if (wfrq <= 10000)
        return WORD_8000_10000;
    if (wfrq <= 12000)
        return WORD_10000_12000;
    if (wfrq <= 14000)
        return WORD_12000_14000;

    return WORD_14000_16000;
}

// Adds one to the counter of wfrq_array that belongs to wfrq_tag.
// Any tag that is not one of the ranked bands (WORD_UNDER_4000 .. WORD_OVER_16000)
// is counted under WORD_NOT_FOUND_COCA.
void word_frequency_analyze(array<int, WORD_LEVEL_NUM>& wfrq_array, TagWordFrequencyType wfrq_tag)
{
    const bool is_ranked_band = (wfrq_tag >= WORD_UNDER_4000) && (wfrq_tag <= WORD_OVER_16000);
    const TagWordFrequencyType slot = is_ranked_band ? wfrq_tag : WORD_NOT_FOUND_COCA;

    wfrq_array[slot] += 1;
}

// Returns true when c is an ASCII letter, i.e. within 'a'..'z' or 'A'..'Z'.
bool isaletter(const char& c)
{
    const bool is_lower = ('a' <= c) && (c <= 'z');
    const bool is_upper = ('A' <= c) && (c <= 'Z');
    return is_lower || is_upper;
}


// CLetters class implementation
// Creates an object whose word is empty until fill() is called.
CLetters::CLetters()
    : m_word()
{
}

// Nothing to release: the only member is a string that cleans up after itself.
CLetters::~CLetters() = default;

// Replaces the stored word with the characters of vw, converted to lower case.
// vw is only read; it is left unchanged.
void CLetters::fill(vector<char>& vw)
{
    m_word.assign(vw.begin(), vw.end());

    // A lambda is used instead of passing `tolower` directly: with `using namespace std`
    // the bare name can be an unresolved overload set (the <locale> template also matches),
    // and std::tolower is only defined for values representable as unsigned char.
    std::transform(m_word.begin(), m_word.end(), m_word.begin(),
        [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
}

// Returns a copy of the stored (lower-case) word.
const string CLetters::word()
{
    return m_word;
}

// Returns the first character of the stored word.
// For an empty word, index 0 refers to the string's terminating '\0'.
const char CLetters::firstletter()
{
    return m_word[0];
}

// Writes a progress message for the current word to cout:
// the quoted word followed by "..." and a tab, without a line break.
void CLetters::processing()
{
    cout << "Finding word \"";
    cout << m_word;
    cout << "\"...\t";
}


bool CLetters::usual_recheck()
{
    //check if the word is usual
    bool RetVal = false;
    for (int i = 0; i < USUAL_WORD_NUM; i++)
    {
        if (m_word == usual_w_out_of_COCA_str[i])
        {
            RetVal = true;
        }
        else
        {
            //do nothing
        }
    }
    return RetVal;
}

bool CLetters::form_recheck()
{
    bool RetVal = false;
    if (m_word.length() > 3)
    {
        char e1, e2, e3;
        e3 = m_word[m_word.length() - 3];    //last but two letter
        e2 = m_word[m_word.length() - 2];    //last but one letter
        e1 = m_word[m_word.length() - 1];    //last letter

        if (e1 == 's')
        {
            m_word.erase(m_word.length() - 1);
            RetVal = true;
        }
        else if (e2 == 'e' && e1 == 'd')
        {
            m_word.erase(m_word.length() - 1);
            m_word.erase(m_word.length() - 1);
            RetVal = true;
        }
        else if (e3 == 'i' && e2 == 'n' && e1 == 'g')
        {
            m_word.erase(m_word.length() - 1);
            m_word.erase(m_word.length() - 1);
            m_word.erase(m_word.length() - 1);
            RetVal = true;
        }
        else
        {
            //do nothing
        }
    }
    return RetVal;
}

 

main.cpp

/* main.cpp */

#include <numeric>
#include <iomanip>
#include <ctime>
#include "TextVocabularyAnalyzer.h"

int main()
{
    //file init
    ifstream COCA_txt("D:\\COCA.txt");
    ifstream USER_txt("D:\\JobsSpeech.txt");

    //time init
    clock_t startTime, endTime;
    double build_map_time = 0;
    double process_time = 0;

    startTime = clock();    //build time start

    //build COCA words map
    map<string, int> COCA_WordsList[WORDS_HEAD_NUM];
    int readlines = 0;

    while (readlines < COCA_WORDS_NUM)
    {
        int frequency = 0; string word = "";
        COCA_txt >> frequency;
        COCA_txt >> word;

        //transform to lower uniformly 
        transform(word.begin(), word.end(), word.begin(), tolower);

        //import every word
        for (int whead = WORDS_HEAD_A; whead < WORDS_HEAD_NUM; whead++)
        {
            //check word head 
            if (word[0] == alphabet_str[whead])
            {
                //if a word already exists, only load its lower frequency
                if (COCA_WordsList[whead].find(word) == COCA_WordsList[whead].end())
                {
                    COCA_WordsList[whead].insert(make_pair(word, frequency));
                }
                else
                {
                    COCA_WordsList[whead][word] = frequency < COCA_WordsList[whead][word] ? frequency : COCA_WordsList[whead][word];
                }
            }
            else
            {
                // do nothing
            }
        }
        readlines++;
    }

    endTime = clock();    //build time stop
    build_map_time = (double)(endTime - startTime) / CLOCKS_PER_SEC;

    //user prompt
    cout << "COCA words list imported.\nPress any key to start frequency analysis...\n";
    cin.get();

    startTime = clock();    //process time start

    //find text words
    vector<char> content_read;
    CLetters word_readed;
    vector<int> frequecy_processed = { 0 };
    array<int, WORD_LEVEL_NUM> words_analysis_array{ 0 };
    char char_read = ' ';

    //get text char one by one
    while (USER_txt.get(char_read))
    {
        //only letters and '-' between letters will be received
        if (isaletter(char_read) || char_read == '-')
        {
            content_read.push_back(char_read);
        }
        else
        {
            //char which is not a letter marks the end of a word
            if (!content_read.empty())    //skip single letter 
            {
                int current_word_frequency = 0;

                //assign letters to make the word
                word_readed.fill(content_read);
                word_readed.processing();

                cout << "Frequency:";
                //check the word's head and find its frequency in COCA list
                for (int whead = WORDS_HEAD_A; whead < WORDS_HEAD_NUM; whead++)
                {
                    if (word_readed.firstletter() == alphabet_str[whead])
                    {
                        cout << COCA_WordsList[whead][word_readed.word()];
                        current_word_frequency = COCA_WordsList[whead][word_readed.word()];

                        //check if the word has been processed
                        if (current_word_frequency == 0)
                        {
                            //addtional check
                            if (word_readed.usual_recheck())
                            {
                                word_frequency_analyze(words_analysis_array, WORD_UNDER_4000);
                            }
                            else if (word_readed.form_recheck())
                            {
                                current_word_frequency = COCA_WordsList[whead][word_readed.word()];    //try again
                                if (current_word_frequency > 0)
                                {
                                    frequecy_processed.push_back(current_word_frequency);
                                    word_frequency_analyze(words_analysis_array, frequency_classify(current_word_frequency));
                                }
                                else
                                {
                                    // do nothing
                                }
                            }
                            else
                            {
                                word_frequency_analyze(words_analysis_array, WORD_NOT_FOUND_COCA);
                            }
                        }
                        else if (find(frequecy_processed.begin(), frequecy_processed.end(), current_word_frequency)
                            == frequecy_processed.end())
                        {
                            //classify this word and make statistics
                            frequecy_processed.push_back(current_word_frequency);
                            word_frequency_analyze(words_analysis_array, frequency_classify(current_word_frequency));
                        }
                        else
                        {
                            // do nothing
                        }
                    }
                    else
                    {
                        //do nothing
                    }
                }
                cout << endl;

                content_read.clear();
            }
            else
            {
                //do nothing
            }
        }
    }

    endTime = clock();    //process time stop
    process_time = (double)(endTime - startTime) / CLOCKS_PER_SEC;

    //calc whole words processed
    int whole_words_analyzed = 0;
    whole_words_analyzed = accumulate(words_analysis_array.begin(), words_analysis_array.end(), 0);

    //report result
    cout << "\n////////// Report ////////// \n";
    for (int i = 0;i< words_analysis_array.size();i++)
    {
        cout << report_str[i] <<"\t"<< words_analysis_array[i] << " (";
        cout<<fixed<<setprecision(2)<<(float)words_analysis_array[i] * 100 / whole_words_analyzed << "%)" << endl;
    }
    cout << "\nWords totally analyzed: " << whole_words_analyzed << endl;

    //show run time
    cout << "Map build time: " << build_map_time*1000 << "ms.\n";
    cout << "Process time: " << process_time*1000 << "ms.\n";
    cout << "////////////////////////////" << endl;

    //close file
    COCA_txt.close();
    USER_txt.close();

    return 0;
}

 

标签:HEAD,word,COCA,wfrq,词频,WORDS,测试工具,WORD,define
来源: https://www.cnblogs.com/banmei-brandy/p/13235125.html

本站声明: 1. iCode9 技术分享网(下文简称本站)提供的所有内容,仅供技术学习、探讨和分享;
2. 关于本站的所有留言、评论、转载及引用,纯属内容发起人的个人观点,与本站观点和立场无关;
3. 关于本站的所有言论和文字,纯属内容发起人的个人观点,与本站观点和立场无关;
4. 本站文章均是网友提供,不完全保证技术分享内容的完整性、准确性、时效性、风险性和版权归属;如您发现该文章侵犯了您的权益,可联系我们第一时间进行删除;
5. 本站为非盈利性的个人网站,所有内容不会用来进行牟利,也不会利用任何形式的广告来间接获益,纯粹是为了广大技术爱好者提供技术内容和技术思想的分享性交流网站。

专注分享技术,共同学习,共同进步。侵权联系[81616952@qq.com]

Copyright (C)ICode9.com, All Rights Reserved.

ICode9版权所有