!*************************************************************************************************************
! Protein compare
!
! Threads the amino acid sequence of one protein through the three-dimensional structure of another
!
! Input files:
!
!      Structure file: PDB format. Must have only one chain
!
!      Alignment file: PIR format (as output by Clustal)
!           Leading and lagging blanks are OK
!
!           > [header line for sequence with known structure]
!             [optional blank line]
!             [amino acid sequence, with gaps (-) for alignment]
!           * [optional]
!           > [header line for sequence to be threaded]
!             [optional blank line]
!             [amino acid sequence, with gaps (-) for alignment]
!           * [optional]
!
! Output file:
!
!      New structure file: PDB format
!
!           NO CHANGE: Amino acids common between the two sequences go in chain A
!           INSERTIONS: Amino acids in threaded sequence but not original go in chain B
!                Amino acid called "INS"
!                Inserted atom given meaningless number
!           DELETIONS: Amino acids in original but not in threaded sequence go in chain C
!                Amino acid called "DEL" in first atom of residue (others unchanged)
!           REPLACEMENTS: Amino acids changed in threaded sequence go in chain D
!                Amino acid called new name in first atom of residue (others unchanged)
!
!           Informational lines (e.g. Title, source, etc) copied unchanged
!                therefore information may no longer be appropriate
!
!*************************************************************************************************************

! ******************* LIBRARIES and DECLARATIONS *******************

  LIBRARY "Aminoacid_module.TRU"          ! Contains routines to access info about amino acids
  LIBRARY "Utilities.trc"                 ! Contains Explode
  
  DECLARE FUNCTION GetAlignInfo$          ! Reads alignment file, extracts a sequence from it
  DECLARE FUNCTION UpdateChainID$
  DECLARE FUNCTION updateResidueName$
  DECLARE FUNCTION UpdateResidueSeqNum$
  DECLARE FUNCTION Alignment_comparison$  ! Creates summary of alignment between two sequences
  DECLARE FUNCTION RepStr$
  DECLARE FUNCTION GetSize
  DECLARE FUNCTION AA_translation$        ! Takes 3-letter code to 1-letter code (or reverse)

! ************************ FILE DEFINITIONS ************************

  LET alignmentFile = 1
  LET alignmentFile$ = "seq.pir"

  LET structureFile = 2
  LET structureFile$ = "UDPGDH Streptococcus pyogenes.pdb"

  LET outputFile = 3
  LET outputFile$ = "MesorhizobiumUDPglcDH.pdb"

! *************************** CONSTANTS ****************************

  LET gap$ = "-"
  LET true = 1
  LET false = 0

! **************************** VARIABLES ***************************

! knownSeq$              Amino acid sequence of protein with known structure
! threadedSeq$           Amino acid sequence of protein to be threaded through structure
! sequence_comparison$   Analysis of alignment: M:match, I:insertion, D:deletion, R:replacement
  DIM words$(1)        ! Used by Explode to contain words within line

! ************************** MAIN PROGRAM **************************

  LET knownSeq$ =  GetAlignInfo$(alignmentFile$, 1)
  LET knownSeqHeader$ = header$

  LET threadedSeq$ =  GetAlignInfo$(alignmentFile$, 2)
  LET threadedSeqHeader$ = header$

  CALL PrintSequences

  LET sequence_comparison$ = Alignment_comparison$(knownSeq$, threadedSeq$)
  
  CALL CreateNewStructure

  PRINT "Threading completed"

! ******************* SUBROUTINES and FUNCTIONS ********************


!*************************************************************************************************************
!
! Opens a file containing multiple sequences in FASTA format, return the sequence requested by position
!
!*************************************************************************************************************

SUB PrintSequences

    PRINT "Sequence of protein with known structure:  ";
    PRINT knownSeqHeader$
    PRINT knownSeq$
    PRINT
    PRINT "Sequence of protein to be threaded:  ";
    PRINT threadedSeqHeader$
    PRINT threadedSeq$
    PRINT
END SUB

!*************************************************************************************************************
! GetAlignInfo$ (path to file, number of sequence to extract)
! Opens a file containing multiple sequences in FASTA format, return the sequence requested by position
!
!*************************************************************************************************************

FUNCTION GetAlignInfo$ (path$, seqNum)

    LET seq$ = ""
    OPEN #alignmentFile: name path$

    LET currentSeq = 0

    DO WHILE MORE #alignmentFile

       LINE INPUT #alignmentFile: line$
       LET firstLetter$ = line$[1:1]

       SELECT CASE firstLetter$
          CASE ">"                           ! Marks beginning of sequence
             LET currentSeq = currentSeq + 1
             IF currentSeq = seqNum THEN 
                CALL Explode(line$, words$, " >")
                   ! Splits line into words separated by spaces (put in array words$)
                LET header$ = words$(1)
             END IF
          CASE "*"
          CASE ELSE
             IF currentSeq = seqNum THEN LET seq$ = seq$ & Trim$(line$)
       END SELECT

    LOOP

    LET GetAlignInfo$ = seq$
       
    CLOSE #alignmentFile

END FUNCTION

!*************************************************************************************************************
! Alignment_comparison (known sequence, threaded sequence)
!       Analyze alignment at each position
!           Put I (insertion) where gap in sequence with known structure
!           Put D (delection) where gap in sequence to be threaded
!           Put M (match) where both sequences match
!           Put R (replacement) where sequences differ
!
!*************************************************************************************************************

FUNCTION Alignment_comparison$(knownSequence$, threadedSequence$)

    LOCAL comparison$
    LET comparison$ = ""

    LET seqLen = LEN(knownSequence$)           ! Both sequences must be same length

    FOR i = 1 to seqLen

        IF knownSequence$[i:i] = gap$ THEN
           LET comparison$ = comparison$ & "I"
        ELSE IF threadedSequence$[i:i] = gap$ THEN
           LET comparison$ = comparison$ & "D"
        ELSE IF knownSequence$[i:i] <> threadedSequence$[i:i] THEN
           LET comparison$ = comparison$ & "R"
        ELSE
           LET comparison$ = comparison$ & "M"
        END IF

    NEXT i

    LET Alignment_comparison$ = comparison$

END FUNCTION

!*************************************************************************************************************
! CreateNewStructure
!
!    Copy pdb file and modify the ATOM records according to the edit transcript while copying
!    Changes only the first atom of a residue
!
!*************************************************************************************************************

SUB CreateNewStructure

    LOCAL line$, transcriptPosition, atomPosition, seqPosition, previousResidue$, residue, residueCount, newLine$

    LET residueCount = GetSize(threadedSeq$, 1, len(threadedSeq$))
    
    OPEN #structureFile: NAME structureFile$
    OPEN #outputFile: NAME outputFile$, CREATE NEWOLD
    ERASE #outputFile

!    LET residue = 0

    DO
       LINE INPUT #structureFile: line$

       ! the only lines that matter are ATOM lines.  More specifically, the first ATOM line
       ! for a residue.  position [23:26] of an ATOM line is the residue sequence number

       LET linetype$ = line$[1:4]
       IF linetype$ = "ATOM" THEN
          LET residue = Val(line$[23:26])
          LET choice_for_residue$ = sequence_comparison$[residue:residue]

          SELECT CASE choice_for_residue$
             CASE "M"
                  ! If the residues match, do nothing (copy line)
             CASE "R"
                  ! If a residue is replaced, we update residue's name, chain ID in the record
                  LET line$ = updateResidueName$(threadedSeq$[residue:residue], line$)
                  LET line$ = UpdateChainID$("D", line$)
             CASE "D"
                  ! If a residue is deleted (gap in the sequence it's aligned with)
                  !   then insert a new record with the name of "DEL" on chain C
                  LET newLine$ = line$
                  LET residueCount = residueCount + 1
                  LET newLine$ = updateResidueName$("DEL", newLine$)
                  LET newLine$ = UpdateChainID$("C", newLine$)
                  LET newLine$ = UpdateResidueSeqNum$(USING$("####",residueCount), newLine$)
                  PRINT #outputFile: newLine$
             CASE "I"
                  ! If a residue is inserted in the sequence it's aligned with (gap in current sequence)
                  LET newLine$ = line$
                  LET residueCount = residueCount + 1
                  LET newLine$ = updateResidueName$("INS", newLine$)
                  LET newLine$ = UpdateChainID$("B", newLine$)
                  LET newLine$ = UpdateResidueSeqNum$(USING$("####",residueCount), newLine$)
                  PRINT #outputFile: newLine$
             CASE ELSE
                  PRINT "ERROR: *"; sequence_comparison$[residue:residue];"*"
          END SELECT
       ELSE               ! not ATOM line
          ! don't care about it if it's not an ATOM record
       END IF

       LET previous_linetype$ = linetype$
       PRINT #outputFile: line$
    LOOP UNTIL END #structureFile

    CLOSE #structureFile
    CLOSE #outputFile

END SUB


!*************************************************************************************************************
! GetSize(sequence, start, stop)
!
!    Gives the length of a sequence, excluding gap characters 
!
!	Input:
!		seq$ : sequence to be counted
!		start: position in sequence to start counting
!		stop : position in sequence to stop counting
!
!*************************************************************************************************************
FUNCTION GetSize(seq$, start, stop)

    LOCAL position, skip
    LET position = start
    LET skip = 0

    DO                                          ! Count gaps in sequence
       LET position = POS(seq$, gap$, position)
       IF position > 0 AND position < stop THEN
          LET skip = skip + 1
          LET position = position + 1
       ELSE
          EXIT DO
       END IF
    LOOP

    LET GetSize = (stop - start + 1) - skip
END FUNCTION


!*************************************************************************************************************
! UpdateChainID$ (chainID$, record$)
!
!   Updates the chain ID for the given ATOM record.  
!   ATOM record is fixed width column, chain ID is the 22nd character
!
!	Input:
!		chainID$: new chainID to be written to the record (e.g. A, B, C, D)
!		record$ : ATOM line in pdb format 
!
!*************************************************************************************************************
FUNCTION UpdateChainID$(chainID$, record$)
    IF len(chainID$) <> 1 THEN
       LET UpdateChainID$ = ""
    ELSE
       LET UpdateChainID$ = record$[1:21] & chainID$ & record$[23:len(record$)]
    END IF
END FUNCTION



!*************************************************************************************************************
! UpdateResidueName$ (residueName$, record$)
!
!   Updates the residue name for the given ATOM record.  
!   ATOM record is fixed width column, residue name takes up positions 18th to 20th
!
!	Input:
!		residueName$: residue name to be written to the record
!		record$	    : ATOM line in pdb format 
!
!*************************************************************************************************************
FUNCTION updateResidueName$(residueName$, record$)
    LET length = len(residueName$)

    IF length = 1 THEN
       LET name$ = UCase$(AA_translation$(residueName$))
    ELSE
       LET name$ = residueName$
    END IF

    IF len(name$) <> 3 THEN
       PRINT "aa not found: "; residueName$
       LET updateResidueName$ = ""
    ELSE
       LET updateResidueName$ = record$[1:17] & name$ & record$[21:len(record$)]
    END IF
END FUNCTION

!*************************************************************************************************************
! UpdateResidueSeqNum$( residueSeqNum$, record$ )
! update the residue sequence number for the given ATOM record.  ATOM record is fixed width column, residue 
! sequence number takes up positions 23rd to 26th
!
!	Input:
!		residueSeqNum$: residue sequence number to be written to the record
!		record$	      : ATOM record in pdb format 
!
!*************************************************************************************************************
FUNCTION UpdateResidueSeqNum$(residueSeqNum$, record$)

    LET length = len(residueSeqNum$)
    LET UpdateResidueSeqNum$ = record$[1:22] & residueSeqNum$ & record$[27:len(record$)]

END FUNCTION



END