#! /usr/bin/env python3
#
class ngram_score ( object ):

#*****************************************************************************80
#
## ngram_score() is a class for the ngram scoring program.
#
#  Discussion:
#
#    An instance is built from a text file with one "NGRAM<sep>COUNT" record
#    per line.  The counts are converted to log10 relative frequencies, and
#    score() sums those values over every window of a text.
#
#  Modified:
#
#    30 August 2021
#
#  Author:
#
#    Unknown
#
  def __init__( self, ngramfile, sep = ' ' ):
    '''Load a file of ngrams and counts, and calculate log10 probabilities.

    Input:
      ngramfile: path of a text file holding one "ngram<sep>count" per line.
      sep: the string separating the ngram from its count on each line.

    Attributes set:
      self.ngrams: dict mapping each ngram to log10 ( count / N ).
      self.L: length of the last ngram read, used as the window width.
      self.N: sum of all counts.
      self.floor: log10 ( 0.01 / N ), the value charged for unknown ngrams.

    Raises:
      ValueError: if the file has no ngram records, or a non-blank line
        does not split into exactly two fields with an integer count.
    '''

    import numpy as np

    counts = {}
    key = None
#
#  The context manager closes the file even if a line fails to parse.
#
    with open ( ngramfile, 'rt' ) as fh:
      for line in fh:
#
#  Blank lines, such as a trailing empty line, carry no record.
#
        if not line.strip ( ):
          continue
        key, count = line.split ( sep )
        counts[key] = int ( count )

    if key is None:
      raise ValueError ( 'ngram_score(): no ngram records found in "%s"' % ( ngramfile ) )

    self.L = len ( key )
    self.N = sum ( counts.values() )
#
#  Calculate log probabilities.
#
    self.ngrams = {}
    for key, count in counts.items():
      self.ngrams[key] = np.log10 ( float ( count ) / self.N )
#
#  Ngrams absent from the table are scored as if seen 0.01 times.
#
    self.floor = np.log10 ( 0.01 / self.N )

  def score ( self, text ):
    '''Compute the score of text.

    Every window of self.L consecutive characters contributes its stored
    log10 probability, or self.floor when the window is not in the table.

    Input:
      text: the string to be scored.

    Output:
      The sum of the contributions; the integer 0 when text is shorter
      than self.L, since then there are no windows.
    '''
    score = 0
    width = self.L
    floor = self.floor
#
#  A single dict.get() replaces the membership test plus lookup.
#
    lookup = self.ngrams.get
    for i in range ( len ( text ) - width + 1 ):
      score += lookup ( text[i:i+width], floor )
    return score

def ngram_score_test1 ( ):

#*****************************************************************************80
#
## ngram_score_test1() tests text against monogram statistics.
#
#  Discussion:
#
#    The monogram table is read from
#    '../../datasets/ngrams/english_monograms.txt', and the text files
#    are read from the current directory.  One line is printed for each
#    string or file that is scored.
#
#  Licensing:
#
#    This code is distributed under the MIT license. 
#
#  Modified:
#
#    15 February 2016
#
#  Author:
#
#    John Burkardt
#
  import platform

  print ( '' )
  print ( 'ngram_score_test1():' )
  print ( '  Python version: %s' % ( platform.python_version ( ) ) )
  print ( '  ngram_score() tests a string or text against English ngram statistics.' )
  print ( '  Here we do a test against English monograms.' )
  print ( '' )
  print ( '  Apparently, you want to remove all nonalphabetic information,' )
  print ( '  and uppercase your text.  But you may wish to preserve spaces.' )
  print ( '' )

  fitness = ngram_score ( '../../datasets/ngrams/english_monograms.txt' )
#
#  Notice that NGRAM_SCORE is affected by the case of the text,
#  by spaces, and by punctuation.
# 
  samples = ( \
    'HELLOWORLD', \
    'HELLO WORLD', \
    'helloworld', \
    'HELLO, WORLD!', \
    'Hello, world!' )

  for s in samples:
    score = fitness.score ( s )
    print ( '  %s length = %d, score = %g' % ( s, len ( s ), score ) )
#
#  Read text from files, uppercasing it before scoring.
#
#  Oddly enough, HELLOWORLD read from a file gives a different
#  score from HELLOWORLD entered as a string.  It seems to have
#  a terminating character (presumably a trailing newline), which is
#  deliberately left in place here.
#
#  The last file is the rot13 image of desiderata.txt, so it has the
#  same length, but its score should be much worse.
#
  filenames = ( \
    'HELLOWORLD.txt', \
    'desiderata.txt', \
    'qrfvqrengn.gkg' )

  for filename in filenames:
#
#  The context manager closes the file even if reading fails.
#
    with open ( filename, 'r' ) as handle:
      t = handle.read ( )
    t = str.upper ( t )
    score = fitness.score ( t )
    print ( '  %s length = %d, score = %g' % ( filename, len ( t ), score ) )
#
#  Terminate.
#
  print ( '' )
  print ( 'ngram_score_test1():' )
  print ( '  Normal end of execution.' )
  return

def ngram_score_test2 ( ):

#*****************************************************************************80
#
## ngram_score_test2() tests text against bigram statistics.
#
#  Discussion:
#
#    The bigram table is read from
#    '../../datasets/ngrams/english_bigrams.txt', and the text files
#    are read from the current directory.  One line is printed for each
#    string or file that is scored.
#
#  Licensing:
#
#    This code is distributed under the MIT license. 
#
#  Modified:
#
#    15 February 2016
#
#  Author:
#
#    John Burkardt
#
  import platform

  print ( '' )
  print ( 'ngram_score_test2():' )
  print ( '  Python version: %s' % ( platform.python_version ( ) ) )
  print ( '  ngram_score() tests a string or text against English ngram statistics.' )
  print ( '  Here we do a test against English bigrams.' )
  print ( '' )
  print ( '  Apparently, you want to remove all nonalphabetic information,' )
  print ( '  and uppercase your text.  But you may wish to preserve spaces.' )
  print ( '' )

  fitness = ngram_score ( '../../datasets/ngrams/english_bigrams.txt' )
#
#  Notice that NGRAM_SCORE is affected by the case of the text,
#  by spaces, and by punctuation.
# 
  samples = ( \
    'HELLOWORLD', \
    'HELLO WORLD', \
    'helloworld', \
    'HELLO, WORLD!', \
    'Hello, world!' )

  for s in samples:
    score = fitness.score ( s )
    print ( '  %s length = %d, score = %g' % ( s, len ( s ), score ) )
#
#  Read text from the files "HELLOWORLD.txt", "desiderata.txt" and
#  "qrfvqrengn.gkg", uppercasing it before scoring.
#
#  Oddly enough, HELLOWORLD read from a file gives a different
#  score from HELLOWORLD entered as a string.  It seems to have
#  a terminating character (presumably a trailing newline), which is
#  deliberately left in place here.
#
#  The last file is the rot13 image of desiderata.txt, so it has the
#  same length, but its score should be much worse.
#
  filenames = ( \
    'HELLOWORLD.txt', \
    'desiderata.txt', \
    'qrfvqrengn.gkg' )

  for filename in filenames:
#
#  The context manager closes the file even if reading fails.
#
    with open ( filename, 'r' ) as handle:
      t = handle.read ( )
    t = str.upper ( t )
    score = fitness.score ( t )
    print ( '  %s length = %d, score = %g' % ( filename, len ( t ), score ) )
#
#  Terminate.
#
  print ( '' )
  print ( 'ngram_score_test2():' )
  print ( '  Normal end of execution.' )
  return

def ngrams_test ( ):

#*****************************************************************************80
#
## ngrams_test() tests ngrams().
#
#  Discussion:
#
#    A banner with the Python version is printed, then the monogram and
#    bigram tests are run in turn, followed by a closing message.
#
#  Licensing:
#
#    This code is distributed under the MIT license.
#
#  Modified:
#
#    23 March 2021
#
#  Author:
#
#    John Burkardt
#
  from platform import python_version

  version = python_version ( )

  print ( '' )
  print ( 'ngrams_test():' )
  print ( '  Python version: %s' % ( version ) )
  print ( '  Test ngrams()' )
#
#  Run the individual tests in order.
#
  for run_test in ( ngram_score_test1, ngram_score_test2 ):
    run_test ( )
#
#  Terminate.
#
  print ( '' )
  print ( 'ngrams_test():' )
  print ( '  Normal end of execution.' )

def timestamp ( ):

#*****************************************************************************80
#
## timestamp() prints the date as a timestamp.
#
#  Discussion:
#
#    The current local time is printed on one line, in the format
#    produced by time.ctime().  Nothing is returned.
#
#  Licensing:
#
#    This code is distributed under the MIT license. 
#
#  Modified:
#
#    06 April 2013
#
#  Author:
#
#    John Burkardt
#
  from time import ctime
#
#  With no argument, ctime() formats the current time.
#
  stamp = ctime ( )
  print ( stamp )

#
#  When run as a script, bracket the full test run with timestamps.
#
if __name__ == '__main__':
  timestamp ( )
  ngrams_test ( )
  timestamp ( )