phonyland/ngram

🧪 N-Gram Tools for 🙃 Phony Language Models with sanitizing, tokenization, n-gram extraction, frequency mapping

dev-master 2024-05-22 20:14 UTC

This package is auto-updated.

Last update: 2024-11-22 21:13:55 UTC


README

Phony Logo - Light Phony Logo - Dark

🧪
N-Gram Tools

This repository contains the N-Gram Tools for 🙃 Phony Language that includes features like sanitizing, tokenization, n-gram extraction, frequency mapping.

🚀 Installation

Requires PHP >= 8.0.

You can install the package via composer:

composer require phonyland/ngram

⌨️ Usage

Tokenizer

Word Tokenization

$tokenizer->tokenize($text);
⌨️ Usage
use Phonyland\NGram\Tokenizer;
use Phonyland\NGram\TokenizerFilter;

$tokenizer = new Tokenizer();
$tokenizer
  ->addWordSeparatorPattern(';')
  ->addWordSeparatorPattern('\s')
  ->addWordFilterRule(TokenizerFilterType::NO_SYMBOLS);

$text = 'sample   text;sample;text';

$tokenizer->tokenize($text);
🖥 Output
[
    "sample",
    "text",
    "sample",
    "text",
];

Sentence Tokenization

$tokenizer->sentences($text);
⌨️ Usage
use Phonyland\NGram\Tokenizer;

$tokenizer = new Tokenizer();
$tokenizer
  ->addSentenceSeparatorPattern('.')
  ->addSentenceSeparatorPattern('!')
  ->addSentenceSeparatorPattern('?');

$text = 'Sample Sentence. Sample Sentence! Sample Sentence? Sample Sentence no. 4?! Sample sample sentence... End';

$tokenizer->sentences($text);
🖥 Output
[
    "Sample Sentence.",
    "Sample Sentence!",
    "Sample Sentence?",
    "Sample Sentence no.",
    "4?!",
    "Sample sample sentence...",
    "End",
];

Word Tokenization by Sentences

$tokenizer->tokenizeBySentences($text);
⌨️ Usage
use Phonyland\NGram\Tokenizer;
use Phonyland\NGram\TokenizerFilter;

$tokenizer = new Tokenizer();
$tokenizer
  ->addSentenceSeparatorPattern('.')
  ->addSentenceSeparatorPattern('!')
  ->addSentenceSeparatorPattern('?')
  ->addWordFilterRule(TokenizerFilterType::NO_SYMBOLS)
  ->addWordSeparatorPattern(TokenizerFilterType::WHITESPACE_SEPARATOR);

$text = 'Sample Sentence. Sample Sentence! Sample Sentence? Sample Sentence no. 4?! Sample sample sentence... End';

$tokenizer->tokenizeBySentences($text);
🖥 Output
[
    ["Sample", "Sentence"],
    ["Sample", "Sentence"],
    ["Sample", "Sentence"],
    ["Sample", "Sentence", "no"],
    ["Sample", "sample", "sentence"],
    ["End"],
];

N-Gram

N-Gram Sequence

NGramSequence::multigram($n, $tokens, $isUnique);
NGramSequence::trigram($tokens, $isUnique);
NGramSequence::bigram($tokens, $isUnique);
NGramSequence::unigram($tokens, $isUnique);
⌨️ Usage
use Phonyland\NGram\Tokenizer;
use Phonyland\NGram\NGramSequence;
use Phonyland\NGram\TokenizerFilter;

$tokenizer = new Tokenizer();
$tokenizer->addWordSeparatorPattern(TokenizerFilterType::WHITESPACE_SEPARATOR);
$tokens = $tokenizer->tokenize('sample text');

NGramSequence::multigram(4, $tokens);
// ['samp', 'ampl', 'mple', 'text'];

// Generate Unique N-Grams 
NGramSequence::unigram($tokens, true);
// ['s', 'a', 'm', 'p', 'l', 'e', 't', 'x'];

N-Gram Sequences with Count

NGramCount::multigram(4, $tokens);
NGramCount::trigram($tokens);
NGramCount::bigram($tokens);
NGramCount::unigram($tokens);

NGramCount::incrementElementCount($element, $elements);
⌨️ Usage
use Phonyland\NGram\Tokenizer;
use Phonyland\NGram\NGramCount;

$tokenizer = new Tokenizer();
$tokenizer->addWordSeparatorPattern(TokenizerFilterType::WHITESPACE_SEPARATOR);
$tokens = $tokenizer->tokenize('sample text');

NGramCount::multigram(4, $tokens);
// [
//     'samp' => 1,
//     'ampl' => 1,
//     'mple' => 1,
//     'text' => 1,
// ];

N-Gram Frequency

NGramFrequency::multigram(4, $tokens);
NGramFrequency::multigram($tokens);
NGramFrequency::bigram($tokens);
NGramFrequency::unigram($tokens);

NGramFrequency::frequencyFromCount($countArray);
⌨️ Usage
use Phonyland\NGram\Tokenizer;
use Phonyland\NGram\NGramFrequency;
use Phonyland\NGram\TokenizerFilter;

$tokenizer = new Tokenizer();
$tokenizer->addWordSeparatorPattern(TokenizerFilterType::WHITESPACE_SEPARATOR);
$tokenizer->addWordFilterRule(TokenizerFilterType::ALPHABETICAL);
$tokens = $tokenizer->tokenize('bombadil! bombadillo!');

NGramFrequency::multigram(4, $tokens);
//[
//    'bomb' => 0.16666666666666666,
//    'omba' => 0.16666666666666666,
//    'mbad' => 0.16666666666666666,
//    'badi' => 0.16666666666666666,
//    'adil' => 0.16666666666666666,
//    'dill' => 0.08333333333333333,
//    'illo' => 0.08333333333333333,
//]

🙃

Start generating fake data with 🙃 Phony Framework,
visit the main Phony Repository.

Explore the docs » https://phony.land
Follow us on Twitter » @phony_land

🙃 Phony
Fake Data Generation Framework

was created by
Yunus Emre Deligöz
under
MIT license.