#! /usr/bin/env python3
#
def markov_letters_test ( ):

#*****************************************************************************80
#
## markov_letters_test() tests markov_letters().
#
#  Discussion:
#
#    The versions of the supporting libraries are reported first.  Then
#    each sample text, identified by its header, is passed through a fixed
#    list of unigram and bigram reports, in the order listed below.
#
#  Licensing:
#
#    This code is distributed under the MIT license.
#
#  Modified:
#
#    23 February 2026
#
#  Author:
#
#    John Burkardt
#
  import matplotlib
  import numpy as np
  import platform

  print ( '' )
  print ( 'markov_letters_test():' )
  print ( '  matplotlib version: %s' % ( matplotlib.__version__ ) )
  print ( '  python version:     %s' % ( platform.python_version ( ) ) )
  print ( '  numpy version:      %s' % ( np.version.version ) )
  print ( '  Test markov_letters().' )
#
#  Each entry pairs a header with the reports to run on it, in order.
#
  plan = (
    ( 'abc', (
      unigram_frequency_print_test,
      bigram_frequency_print_test ) ),
    ( 'quick_brown_fox', (
      unigram_frequency_print_test,
      bigram_frequency_print_test ) ),
    ( 'panjandrum', (
      unigram_frequency_print_test,
      unigram_frequency_print_sorted_test,
      unigram_frequency_plot_test,
      bigram_frequency_print_test,
      bigram_probability_print ) ),
    ( 'alice_in_wonderland', (
      unigram_frequency_print_test,
      unigram_frequency_print_sorted_test,
      unigram_frequency_plot_test,
      unigram_frequency_plot_sorted_test,
      bigram_probability_print ) ),
    ( 'die_verwandlung', (
      unigram_frequency_print_test,
      unigram_frequency_print_sorted_test,
      unigram_frequency_plot_test,
      unigram_frequency_plot_sorted_test,
      bigram_frequency_print_test,
      bigram_probability_print ) ) )

  for header, reports in plan:
    for report in reports:
      report ( header )
#
#  Terminate.
#
  print ( '' )
  print ( 'markov_letters_test():' )
  print ( '  Normal end of execution.' )

  return

def bigram_frequency_print_test ( header ):

#*****************************************************************************80
#
## bigram_frequency_print_test() tests bigram_frequency().
#
#  Discussion:
#
#    The bigram counts for the file header + '.txt' are computed by
#    bigram_frequency() and displayed by bigram_frequency_print().
#
#  Licensing:
#
#    This code is distributed under the MIT license.
#
#  Modified:
#
#    22 February 2026
#
#  Author:
#
#    John Burkardt
#
#  Input:
#
#    string header: an identifier for the data.
#
  filename = header + '.txt'

  print ( '' )
  print ( 'bigram_frequency_print_test():' )
  print ( '  bigram analysis of "' + filename + '"' )

  letters = unigram_chars ( )
  counts = bigram_frequency ( header )
  bigram_frequency_print ( letters, counts, header )

  return

def bigram_frequency ( header ):

#*****************************************************************************80
#
## bigram_frequency() counts letter-pair frequencies in a file.
#
#  Discussion:
#
#    A sequence of two letters is known as a "bigram".
#
#    The text is read from the file header + '.txt' and converted to
#    lower case, so capital and lower case letters are merged.
#
#    Every run of characters outside a-z is replaced by a single blank.
#    A blank is not a letter, so a pair is only counted when both letters
#    are adjacent within the same word.
#
#  Licensing:
#
#    This code is distributed under the MIT license.
#
#  Modified:
#
#    19 February 2026
#
#  Author:
#
#    John Burkardt
#
#  Input:
#
#    string header: an identifier for the data.
#
#  Output:
#
#    integer FREQ(26,26): counts the number of times character i
#    was immediately followed by character j.
#
  import numpy as np
  import re
#
#  Read the data.  The "with" statement closes the file even if the
#  read fails.
#
  filename = header + '.txt'
  with open ( filename ) as text_file:
    data = text_file.read ( )
#
#  Clean the data.
#  The pattern is a raw string.  If the "r" were inside the quotes, the
#  pattern would match a literal letter r followed by non-letters, and
#  every r at the end of a word would be deleted.
#
  data = data.lower ( )
  data = re.sub ( r'[^a-z]+', ' ', data )
#
#  Count the pairs.
#  I1 is the index of the previous character; it starts out of range so
#  that the first character has no predecessor.
#
  freq = np.zeros ( [ 26, 26 ], dtype = int )

  i1 = -1
  for c in data:
    i2 = char_to_int ( c )
    if ( 0 <= i1 < 26 and 0 <= i2 < 26 ):
      freq[i1,i2] += 1
    i1 = i2

  return freq

def bigram_frequency_print ( c, freq, header ):

#*****************************************************************************80
#
## bigram_frequency_print() prints the letter-pair frequency table.
#
#  Discussion:
#
#    Row i, column j of the table is the count for character i followed
#    by character j.
#
#    Columns are 3 characters wide when every count has at most 2 digits.
#    For larger counts, the columns are widened so that adjacent entries
#    are always separated by at least one blank.
#
#  Licensing:
#
#    This code is distributed under the MIT license.
#
#  Modified:
#
#    22 February 2026
#
#  Author:
#
#    John Burkardt
#
#  Input:
#
#    string C(N): the characters.
#
#    integer FREQ(N,N): the number of times character i
#    was immediately followed by character j.
#
#    string header: an identifier for the data; the table is titled
#    header + '.txt'.
#
  import numpy as np

  filename = header + '.txt'
  freq = np.asarray ( freq )
#
#  Choose a column width that leaves a blank in front of the widest count.
#
  width = 3
  if ( 0 < freq.size ):
    width = max ( width, len ( str ( int ( freq.max ( ) ) ) ) + 1 )

  print ( '' )
  print ( '  ' + filename )
  print ( '' )
#
#  The column labels are indented by the width of the row label.
#
  print ( '   ', end = '' )
  for j in range ( 0, len ( c ) ):
    print ( '%*s' % ( width, c[j] ), end = '' )
  print ( '' )
  print ( '' )
  for i in range ( 0, len ( c ) ):
    print ( '  %c' % ( c[i] ), end = '' )
    for j in range ( 0, len ( c ) ):
      print ( '%*d' % ( width, freq[i,j] ), end = '' )
    print ( '' )

  return

def bigram_probability_print ( header ):

#*****************************************************************************80
#
## bigram_probability_print() prints the letter-pair probability table.
#
#  Discussion:
#
#    The bigram counts for the file header + '.txt' are computed by
#    bigram_frequency().
#
#    Each row of the frequency table is divided by its sum and multiplied 
#    by 100.  (Percentages are easier to read than true probabilities.)
#
#    A row whose sum is zero is printed as all zeros.
#
#    The percentages are computed in floating point and rounded when
#    printed.  Storing them back into the integer count table would
#    truncate them instead.
#
#  Licensing:
#
#    This code is distributed under the MIT license.
#
#  Modified:
#
#    24 February 2026
#
#  Author:
#
#    John Burkardt
#
#  Input:
#
#    string header: an identifier for the data.
#
  import numpy as np

  c = unigram_chars ( )
  freq = bigram_frequency ( header )

  print ( '' )
  print ( 'bigram_probability_print():' )
  print ( '  Analysis of "' + header + '.txt"' )
  print ( '' )
#
#  The column labels are indented by the width of the row label, so that
#  each label sits above its own column.
#
  print ( '   ', end = '' )
  for j in range ( 0, len ( c ) ):
    print ( '  %c' % ( c[j] ), end = '' )
  print ( '' )
  print ( '' )
  for i in range ( 0, len ( c ) ):
    print ( '  %c' % ( c[i] ), end = '' )
    percent = freq[i,:].astype ( float )
    s2 = np.sum ( percent )
    if ( s2 != 0 ):
      percent = 100.0 * percent / s2
    for j in range ( 0, len ( c ) ):
      print ( '%3.0f' % ( percent[j] ), end = '' )
    print ( '' )

  return

def char_to_int ( c ):

#*****************************************************************************80
#
## char_to_int() converts a lowercase character to an integer.
#
#  Discussion:
#
#    The index is the offset of the character from 'a':
#
#    a     0
#    b     1
#    ... ...
#    z    25
#
#    Any character outside the range a-z, including capital letters,
#    is assigned the index 26.
#
#  Licensing:
#
#    This code is distributed under the MIT license.
#
#  Modified:
#
#    15 February 2026
#
#  Author:
#
#    John Burkardt
#
#  Input:
#
#    character C: the character.
#
#  Output:
#
#    integer i: the index.
#
  offset = ord ( c ) - ord ( 'a' )

  if ( 0 <= offset and offset < 26 ):
    return offset

  return 26

def timestamp ( ):

#*****************************************************************************80
#
## timestamp() prints the date as a timestamp.
#
#  Discussion:
#
#    The current time is read from time.time() and printed on one line
#    in the format produced by time.ctime().
#
#  Licensing:
#
#    This code is distributed under the MIT license. 
#
#  Modified:
#
#    15 February 2026
#
#  Author:
#
#    John Burkardt
#
  import time

  now = time.time ( )
  stamp = time.ctime ( now )
  print ( stamp )

  return

def unigram_chars ( ):

#*****************************************************************************80
#
## unigram_chars() returns the unigram values as a vector.
#
#  Discussion:
#
#    The vector holds the 26 lowercase letters a through z, one character
#    per entry, in alphabetical order.
#
#  Licensing:
#
#    This code is distributed under the MIT license.
#
#  Modified:
#
#    22 February 2026
#
#  Author:
#
#    John Burkardt
#
#  Output:
#
#    char c[26]: the unigram values, as a numpy array of 
#    single-character strings.
#
  import numpy as np

  alphabet = "abcdefghijklmnopqrstuvwxyz"
#
#  Split the string into its characters, one per array entry.
#
  c = np.array ( list ( alphabet ), dtype = str )

  return c

def unigram_frequency ( header ):

#*****************************************************************************80
#
## unigram_frequency() counts letter frequencies in a file.
#
#  Discussion:
#
#    An instance of a single letter is known as a "unigram".
#
#    The text is read from the file header + '.txt' and converted to
#    lower case, so capital and lower case letters are merged.
#
#    Every run of characters outside a-z is replaced by a single blank
#    before the letters are counted.
#
#  Licensing:
#
#    This code is distributed under the MIT license.
#
#  Modified:
#
#    19 February 2026
#
#  Author:
#
#    John Burkardt
#
#  Input:
#
#    string header: an identifier for the data.
#
#  Output:
#
#    integer freq(26): counts the number of occurrences of character i.
#
  import numpy as np
  import re
#
#  Get the data.  The "with" statement closes the file even if the
#  read fails.
#
  filename = header + '.txt'
  with open ( filename ) as text_file:
    data = text_file.read ( )
#
#  Clean the data.
#  The pattern is a raw string.  If the "r" were inside the quotes, the
#  pattern would match a literal letter r followed by non-letters, and
#  every r at the end of a word would be deleted before counting.
#
  data = data.lower ( )
  data = re.sub ( r'[^a-z]+', ' ', data )
#
#  Count the data.
#
  c = unigram_chars ( )
  freq = np.zeros ( len ( c ), dtype = int )
  for i, letter in enumerate ( c ):
    freq[i] = data.count ( letter )

  return freq

def unigram_frequency_plot ( c, freq, header ):

#*****************************************************************************80
#
## unigram_frequency_plot() plots a letter frequency table.
#
#  Discussion:
#
#    A bar chart is drawn with one bar per character, labeled by the
#    character and titled with the header.  The current figure is cleared
#    first, and the result is saved to the file header + '_frequency.png'.
#
#  Licensing:
#
#    This code is distributed under the MIT license.
#
#  Modified:
#
#    15 February 2026
#
#  Author:
#
#    John Burkardt
#
#  Input:
#
#    string C(N): the unigrams, used as the tick labels.
#
#    integer FREQ(N): the number of occurrences of character i.
#
#    string header: an identifier for the data.
#
  import matplotlib.pyplot as plt
  import numpy as np

  filename = header + '_frequency.png'
#
#  One bar position per character.
#
  positions = np.arange ( len ( c ) )

  plt.clf ( )
  plt.bar ( positions, freq )
  plt.grid ( True )
  plt.xlabel ( '<-- Character -->' )
  plt.ylabel ( '<-- Frequency -->' )
  plt.xticks ( ticks = positions, labels = c )
  plt.title ( header )
  plt.savefig ( filename )

  print ( '  Graphics saved as "' + filename + '"' )

  return

def unigram_frequency_plot_test ( header ):

#*****************************************************************************80
#
## unigram_frequency_plot_test() tests unigram_frequency_plot().
#
#  Discussion:
#
#    The letter counts for the file header + '.txt' are computed by
#    unigram_frequency() and plotted in alphabetical order.
#
#  Licensing:
#
#    This code is distributed under the MIT license.
#
#  Modified:
#
#    22 February 2026
#
#  Author:
#
#    John Burkardt
#
#  Input:
#
#    string header: an identifier for the data.
#
  filename = header + '.txt'

  print ( '' )
  print ( 'unigram_frequency_plot_test():' )
  print ( '  Plot unigram frequency of "' + filename + '"' )

  counts = unigram_frequency ( header )
  letters = unigram_chars ( )
  unigram_frequency_plot ( letters, counts, header )

  return

def unigram_frequency_plot_sorted_test ( header ):

#*****************************************************************************80
#
## unigram_frequency_plot_sorted_test() plots sorted unigram frequencies.
#
#  Discussion:
#
#    The letter counts for the file header + '.txt' are computed by
#    unigram_frequency(), arranged from most to least frequent, and
#    passed to unigram_frequency_plot() under the name header + '_sorted'.
#
#  Licensing:
#
#    This code is distributed under the MIT license.
#
#  Modified:
#
#    22 February 2026
#
#  Author:
#
#    John Burkardt
#
#  Input:
#
#    string header: an identifier for the data.
#
  import numpy as np

  print ( '' )
  print ( 'unigram_frequency_plot_sorted_test():' )
  print ( '  Plot sorted unigram frequency of "' + header + '.txt"' )

  freq = unigram_frequency ( header )
#
#  argsort() orders the counts ascending; flipping gives descending order.
#
  index = np.argsort ( freq )
  index = np.flip ( index )
  c = unigram_chars ( )

  c_sorted = c[index]
  freq_sorted = freq[index]
  header_sorted = header + '_sorted'
  unigram_frequency_plot ( c_sorted, freq_sorted, header_sorted )

  return

def unigram_frequency_print ( c, freq, header ):

#*****************************************************************************80
#
## unigram_frequency_print() prints a letter frequency table.
#
#  Discussion:
#
#    The header is printed as a title, followed by one line per
#    character giving the character and its count.
#
#  Licensing:
#
#    This code is distributed under the MIT license.
#
#  Modified:
#
#    19 February 2026
#
#  Author:
#
#    John Burkardt
#
#  Input:
#
#    string C(N): the unigrams.
#
#    integer FREQ(N): the number of occurrences of character i.
#
#    string header: an identifier for the data.
#
  print ( '' )
  print ( '  ' + header )
  print ( '' )
  for i, letter in enumerate ( c ):
    print ( '  ', letter, '  ', freq[i] )

  return

def unigram_frequency_print_test ( header ):

#*****************************************************************************80
#
## unigram_frequency_print_test() tests unigram_frequency().
#
#  Discussion:
#
#    The letter counts for the file header + '.txt' are computed by
#    unigram_frequency() and printed in alphabetical order.
#
#  Licensing:
#
#    This code is distributed under the MIT license.
#
#  Modified:
#
#    22 February 2026
#
#  Author:
#
#    John Burkardt
#
#  Input:
#
#    string header: an identifier for the data.
#
  filename = header + '.txt'

  print ( '' )
  print ( 'unigram_frequency_print_test():' )
  print ( '  unigram analysis of "' + filename + '"' )

  counts = unigram_frequency ( header )
  letters = unigram_chars ( )
  unigram_frequency_print ( letters, counts, header )

  return

def unigram_frequency_print_sorted_test ( header ):

#*****************************************************************************80
#
## unigram_frequency_print_sorted_test() prints sorted unigram frequencies.
#
#  Discussion:
#
#    The letter counts for the file header + '.txt' are computed by
#    unigram_frequency(), arranged from most to least frequent, and
#    passed to unigram_frequency_print() under the name header + '_sorted'.
#
#  Licensing:
#
#    This code is distributed under the MIT license.
#
#  Modified:
#
#    22 February 2026
#
#  Author:
#
#    John Burkardt
#
#  Input:
#
#    string header: an identifier for the data.
#
  import numpy as np

  filename = header + '.txt'

  print ( '' )
  print ( 'unigram_frequency_print_sorted_test():' )
  print ( '  unigram analysis of "' + filename + '"' )

  counts = unigram_frequency ( header )
#
#  argsort() orders the counts ascending; flipping gives descending order.
#
  order = np.flip ( np.argsort ( counts ) )
  letters = unigram_chars ( )

  unigram_frequency_print ( letters[order], counts[order], header + '_sorted' )

  return

def unigram_string ( ):

#*****************************************************************************80
#
## unigram_string() returns the unigram values as a string.
#
#  Discussion:
#
#    The string consists of the 26 lowercase letters a through z, in
#    alphabetical order.
#
#  Licensing:
#
#    This code is distributed under the MIT license.
#
#  Modified:
#
#    15 February 2026
#
#  Author:
#
#    John Burkardt
#
#  Output:
#
#    string S(26): the unigram values.
#
  first = ord ( 'a' )
#
#  Build the alphabet from consecutive character codes starting at 'a'.
#
  s = ''.join ( chr ( first + k ) for k in range ( 26 ) )

  return s

#
#  When run as a script, execute the tests, bracketed by timestamps.
#
if __name__ == '__main__':
  for step in ( timestamp, markov_letters_test, timestamp ):
    step ( )