-
Notifications
You must be signed in to change notification settings - Fork 3.6k
Add an advanced text generator based on markov chains. #254
Changes from 6 commits
09e4b3a
8e1a337
36c5974
84dd82c
08069d4
5582419
8667c48
4bdebac
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -117,6 +117,10 @@ Each of the generator properties (like `name`, `address`, and `lorem`) are calle | |
paragraphs($nb = 3) // array('Quidem ut sunt et quidem est accusamus aut. Fuga est placeat rerum ut. Enim ex eveniet facere sunt.', 'Aut nam et eum architecto fugit repellendus illo. Qui ex esse veritatis.', 'Possimus omnis aut incidunt sunt. Asperiores incidunt iure sequi cum culpa rem. Rerum exercitationem est rem.') | ||
text($maxNbChars = 200) // 'Fuga totam reiciendis qui architecto fugiat nemo. Consequatur recusandae qui cupiditate eos quod.' | ||
|
||
### `Faker\Provider\Text` | ||
|
||
realText($maxNbChars = 200, $indexSize = 2) // 'At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur.' | ||
|
||
### `Faker\Provider\Internet` | ||
|
||
email // '[email protected]' | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
<?php | ||
|
||
namespace Faker\Provider; | ||
|
||
class Text extends \Faker\Provider\Base | ||
{ | ||
protected static $baseText = <<<'EOT' | ||
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. | ||
At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit | ||
amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et | ||
justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. | ||
EOT; | ||
protected static $normalizedText = null; | ||
protected static $tables = array(); | ||
|
||
/** | ||
* Generate a text string by the Markov chain algorithm. | ||
* Depending on the $maxNbChars, returns a random valid looking text. The algorithm | ||
* generates a weighted table with the specified number of words as the index and the | ||
* possible following words as the value. | ||
* | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I suggest you add a comment about the Markov chain algorithm. |
||
* @example 'Lorem ipsum dolor sit amet' | ||
* @param integer $maxNbChars Maximum number of characters the text should contain (minimum: 10) | ||
* @param integer $indexSize Determines how many words are considered for the generation of the next word. The minimum is 1, and it produces the highest level of randomness, although the | ||
* generated text usually doesn't make sense. Higher index sizes (up to 10) produce more correct text, at the price of less randomness. | ||
* @return string | ||
*/ | ||
public static function realText($maxNbChars = 200, $indexSize = 2) | ||
{ | ||
if ($maxNbChars < 10) { | ||
throw new \InvalidArgumentException('maxNbChars must be at least 10'); | ||
} | ||
|
||
if ($indexSize < 1) { | ||
throw new \InvalidArgumentException('indexSize must be at least 1'); | ||
} | ||
|
||
if ($indexSize > 10) { | ||
throw new \InvalidArgumentException('indexSize must be at most 10'); | ||
} | ||
|
||
if (!isset(static::$tables[$indexSize])) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think I see a potential bug when switching locales. $faker = Faker\Factory::create('fr_FR');
$faker->realText(100); // generates static $table cache for French locale
$faker = Faker\Factory::create('en_EN');
$faker->realText(100); // uses static $table cache for French locale, generates French text That probably means that There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. also, |
||
$text = static::getNormalizedText(); | ||
|
||
// split into look up parts | ||
$parts = preg_split('/ /u', $text, -1, PREG_SPLIT_NO_EMPTY); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. just out of curiosity, why not use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes, sorry, I meant |
||
|
||
// generate look up table | ||
$table = array(); | ||
for ($i = $indexSize, $max = count($parts) - 1; $i < $max; $i++) { | ||
// calculate index | ||
$index = implode(' ', array_slice($parts, $i - $indexSize, $indexSize)); | ||
if (!isset($table[$index])) $table[$index] = array(); | ||
|
||
// value: next part | ||
$table[$index][] = $parts[$i]; | ||
} | ||
|
||
// cache look up table for performance | ||
static::$tables[$indexSize] = $table; | ||
} | ||
|
||
$table = static::$tables[$indexSize]; | ||
$result = array(); | ||
$resultLength = 0; | ||
|
||
// take a random starting point | ||
$next = static::randomKey($table); | ||
while ($resultLength < $maxNbChars && isset($table[$next])) { | ||
// fetch a random element to append | ||
$append = static::randomElement($table[$next]); | ||
|
||
// calculate next index | ||
$next = preg_split('/ /u', $next, -1, PREG_SPLIT_NO_EMPTY); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why not use |
||
$next[] = $append; | ||
array_shift($next); | ||
$next = implode(' ', $next); | ||
|
||
// ensure text starts with an uppercase letter | ||
if ($resultLength == 0 && !preg_match('/^\p{Lu}/u', $append)) continue; | ||
|
||
// append the element | ||
$result[] = $append; | ||
$resultLength += strlen($append); | ||
} | ||
|
||
// remove the element that caused the text to overflow | ||
array_pop($result); | ||
|
||
// build result | ||
$result = implode(' ', $result); | ||
|
||
return $result.'.'; | ||
} | ||
|
||
protected static function getNormalizedText() | ||
{ | ||
if (static::$normalizedText === null) { | ||
static::$normalizedText = static::$baseText; | ||
static::$normalizedText = preg_replace('/\s+/', ' ', static::$normalizedText); | ||
} | ||
|
||
return static::$normalizedText; | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Since this provider contains English text, it should probably be under the `en_US` locale. So I think you should commit one class with no locale and a basic text (see for instance the base Person generator), and another one, more complete, with the `en_US` locale. The non-locale Text provider can even be abstract, to force a proper text.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Okay, I'll change it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
After thinking about it, putting lorem ipsum text in this class brings confusion with the text class. That's why you should leave the `$baseText` empty and make the class abstract, to be instantiated in locales.