97 lines
4.3 KiB
Plaintext
97 lines
4.3 KiB
Plaintext
include "NSLog.incl"
|
|
|
|
// Builds a ranked word-frequency report for textStr.
//
// Parameters:
//   textStr        - the text to analyse
//   caseSensitive  - when NO, the text is lowercased first so "The" and "the" count as one word
//   ascendingOrder - sort direction for the "count" key (YES = least frequent first)
//
// Returns a string with one line per unique word: rank, occurrence count, word.
//
// Side effect: stores the number of unique (non-empty) words, formatted as a string,
// in the application property "totalWords".
local fn WordFrequency( textStr as CFStringRef, caseSensitive as Boolean, ascendingOrder as Boolean ) as CFStringRef
  '~'1
  CFStringRef     wrd
  CFDictionaryRef dict

  // Depending on the caseSensitive parameter, lowercase the incoming text
  if caseSensitive == NO then textStr = fn StringLowercaseString( textStr )

  // Split on every non-letter character and re-join with single spaces, leaving only letters and spaces
  CFStringRef tempStr = fn ArrayComponentsJoinedByString( fn StringComponentsSeparatedByCharactersInSet( textStr, fn CharacterSetInvertedSet( fn CharacterSetLetterSet ) ), @" " )

  // Separators used to parse the cleaned string into an array: punctuation plus whitespace/newlines
  CFMutableCharacterSetRef separators = fn MutableCharacterSetInit
  MutableCharacterSetFormUnionWithCharacterSet( separators, fn CharacterSetPunctuationSet )
  MutableCharacterSetFormUnionWithCharacterSet( separators, fn CharacterSetWhitespaceAndNewlineSet )

  // Create array of separated words (runs of separators yield empty strings, filtered out below)
  CFArrayRef tempArr = fn StringComponentsSeparatedByCharactersInSet( tempStr, separators )

  // Create a counted set holding each distinct word together with its frequency
  CountedSetRef frequencies = fn CountedSetWithArray( tempArr )

  // Collect the distinct words held by the counted set
  EnumeratorRef enumRef = fn CountedSetObjectEnumerator( frequencies )
  CFArrayRef    array   = fn EnumeratorAllObjects( enumRef )

  // Master array of { "count", "object" } dictionaries, one per unique word
  CFMutableArrayRef wordArr = fn MutableArrayWithCapacity( 0 )

  // Number of unique, non-empty words
  NSInteger totalWords = 0

  for wrd in array
    // Only non-empty strings are real words. Counting them here (instead of counting every
    // entry and subtracting one afterwards) stays correct whether or not the set contains
    // an empty-string entry.
    if ( fn StringLength( wrd ) != 0 )
      totalWords++
      dict = @{ @"count":fn NumberWithUnsignedInteger( fn CountedSetCountForObject( frequencies, wrd ) ), @"object":wrd }
      MutableArrayAddObject( wordArr, dict )
    end if
  next

  // Store the total unique words as a global application property (%ld matches NSInteger)
  AppSetProperty( @"totalWords", fn StringWithFormat( @"%ld", totalWords ) )

  // Sort by frequency in the direction selected by the ascendingOrder parameter
  SortDescriptorRef descriptors = fn SortDescriptorWithKey( @"count", ascendingOrder )
  CFArrayRef        sortedArray = fn ArraySortedArrayUsingDescriptors( wordArr, @[descriptors] )

  // Build the formatted output, one "rank count word" line per dictionary in sorted order
  CFMutableStringRef mutStr = fn MutableStringWithCapacity( 0 )
  NSInteger count = 1
  for dict in sortedArray
    // The stored "count" value is a number object, so read it with the number accessor
    MutableStringAppendString( mutStr, fn StringWithFormat( @"%-7ld %-7ld %@\n", count, fn NumberIntegerValue( fn DictionaryValueForKey( dict, @"count" ) ), fn DictionaryValueForKey( dict, @"object" ) ) )
    count++
  next

  // Create an immutable output string from the mutable string
  CFStringRef resultStr = fn StringWithFormat( @"%@", mutStr )
end fn = resultStr
|
|
|
|
|
|
// Downloads the text at webSite, computes its word frequencies (case-insensitive,
// most frequent first) and logs the report, the unique-word total and the
// processing time to the NSLog window.
//
// Parameters:
//   webSite - address of a UTF-8 text resource
//
// If the address cannot be converted to a URL, or the contents cannot be read,
// a message is logged and the function returns without producing a report.
local fn ParseTextFromWebsite( webSite as CFStringRef )
  // Convert incoming string to URL
  CFURLRef textURL = fn URLWithString( webSite )
  if ( textURL == NULL )
    NSLog( @"Unable to create a URL from: %@", webSite )
    exit fn
  end if

  // Read contents of URL into a string
  CFStringRef textStr = fn StringWithContentsOfURL( textURL, NSUTF8StringEncoding, NULL )
  if ( textStr == NULL )
    NSLog( @"Unable to read text from: %@", webSite )
    exit fn
  end if

  // Start timer (after the download, so only the word counting is measured)
  CFAbsoluteTime startTime = fn CFAbsoluteTimeGetCurrent

  // Calculate frequency of words in text and sort by occurrence, most frequent first
  CFStringRef frequencyStr = fn WordFrequency( textStr, NO, NO )

  // Log results
  NSLogClear
  NSLog( @"%@", frequencyStr )
  NSLog( @"Total unique words in document: %@", fn AppProperty( @"totalWords" ) )

  // Stop timer and log elapsed processing time
  NSLog( @"Elapsed time: %f milliseconds.", ( fn CFAbsoluteTimeGetCurrent - startTime ) * 1000.0 )
end fn
|
|
|
|
// Fetch and analyse the book inside a dispatchglobal block so the parse runs in the background
dispatchglobal
  // Plain-text edition of Les Misérables on Project Gutenberg
  CFStringRef bookAddress = @"https://www.gutenberg.org/files/135/135-0.txt"
  fn ParseTextFromWebsite( bookAddress )
dispatchend

// Hand control to the application's event handling
HandleEvents
|