/*RAPDF*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <math.h>

/*
 * Cut the string off at its first line-ending character.
 *
 * s: NUL-terminated, writable string. The first '\n' or '\r' found is
 *    overwritten with '\0'; if neither occurs the string is left as it was.
 */
void chomp(char *s) {
    /* strcspn gives the length of the leading run free of CR/LF, i.e. the
       index of the first line-ending character or of the terminator. */
    s[strcspn(s, "\n\r")] = '\0';
}

/*
 * Allocate a table of `nrows` rows, each holding three doubles.
 *
 * array: receives the newly allocated row-pointer table.
 * nrows: number of rows wanted; zero or a negative value yields an empty
 *        (but still non-NULL and free()-able) table.
 *
 * On allocation failure an error is written to stderr and the process exits
 * with EXIT_FAILURE, so a caller never sees a NULL table or NULL row.
 * The caller owns the memory: free each row, then the table itself.
 */
void allocate_array_double(double ***array, int nrows) {
    size_t rows = nrows > 0 ? (size_t)nrows : 0;
    size_t r;

    /* Always request at least one slot: malloc(0)/calloc(0, ..) may return
       NULL, which must not be mistaken for running out of memory.
       calloc also checks the count * size multiplication for overflow. */
    *array = calloc(rows > 0 ? rows : 1, sizeof **array);
    if (*array == NULL) {
        fprintf(stderr, "Error: out of memory.\n");
        exit(EXIT_FAILURE);
    }

    for (r = 0; r < rows; r++) {
        (*array)[r] = malloc(3 * sizeof *(*array)[r]);
        if ((*array)[r] == NULL) {
            fprintf(stderr, "Error: out of memory.\n");
            exit(EXIT_FAILURE);
        }
    }
}

/*
 * Map a three-letter residue code to its numeric id.
 *
 * res_code: NUL-terminated residue name; the comparison is exact and
 *           case-sensitive.
 *
 * Returns 0..19 for the twenty codes listed below (the id is the position in
 * the list), or 100 when the code is not one of them.
 */
int get_residue_id(char res_code[]) {
    static const char *const known_codes[] = {
        "ALA", "CYS", "ASP", "GLU", "PHE", "GLY", "HIS", "ILE", "LYS", "LEU",
        "MET", "ASN", "PRO", "GLN", "ARG", "SER", "THR", "VAL", "TRP", "TYR"
    };
    const int n_codes = (int)(sizeof known_codes / sizeof known_codes[0]);
    int id;

    for (id = 0; id < n_codes; id++) {
        if (strcmp(res_code, known_codes[id]) == 0) {
            return id;
        }
    }

    return 100;
}

/*
 * Map a four-character atom-name field to its numeric id.
 *
 * atom_code: NUL-terminated atom name including its padding spaces, exactly
 *            as listed below (for example " CA ").
 *
 * Returns 0..4 for " N  ", " CA ", " C  ", " O  " and " CB " respectively,
 * or 100 for any other string.
 */
int get_atom_id(char atom_code[]) {
    static const char *const known_atoms[] = {
        " N  ", " CA ", " C  ", " O  ", " CB "
    };
    const int n_atoms = (int)(sizeof known_atoms / sizeof known_atoms[0]);
    int id = 0;

    while (id < n_atoms) {
        if (strcmp(atom_code, known_atoms[id]) == 0) {
            return id;
        }
        id++;
    }

    return 100;
}

double get_distance(double vector1[3], double vector2[3])
{
    /*Calculate the magnitude of a vector*/
    
    double dist;
    double diff[3];
    int i;
    
    for (i=0; i<3; i++) {
        diff[i] = vector2[i] - vector1[i];
    }    
    
    dist = sqrt(pow(diff[0],2) + pow(diff[1],2) + pow(diff[2],2));
    
    return dist;
}

/*
 * Convert a distance into a distance-bin index.
 *
 * dist:       the distance to classify.
 * max_bin_no: highest bin index in use.
 *
 * Binning is by the whole-number part of the distance:
 *   [0, 3)              -> bin 0
 *   [k + 2, k + 3)      -> bin k, for k = 1 .. max_bin_no
 *   anything else       -> 100 (out-of-range sentinel)
 *
 * Distances that are negative, NaN, or 21 and above always give 100. The
 * previous implementation used them to index a 21-entry lookup table, which
 * read (and, for max_bin_no > 18, wrote) outside the table.
 */
int get_bin_number(float dist, int max_bin_no) {
    enum { TABLE_SPAN = 21, FIRST_BIN_WIDTH = 3 };
    int whole;

    /* The negated comparison also rejects NaN; checking the upper limit
       before the cast keeps the float-to-int conversion well defined. */
    if (!(dist >= 0.0f) || dist >= (float)TABLE_SPAN) {
        return 100;
    }

    /* For a non-negative value truncation is the same as floor(). */
    whole = (int)dist;

    if (whole < FIRST_BIN_WIDTH) {
        return 0;
    }
    if (whole - FIRST_BIN_WIDTH < max_bin_no) {
        return whole - 2;
    }

    return 100;
}
     

/*-----------------------------------------------------------------------------------------------*/
/*-----------------------------------------------------------------------------------------------*/
/* Table dimensions and buffer sizes used by the scoring program. */
enum {
    RAPDF_N_ATOM_IDS = 5,     /* atom ids 0..4 as returned by get_atom_id()  */
    RAPDF_N_TYPES    = 100,   /* type index = residue id * 5 + atom id       */
    RAPDF_N_BINS     = 18,    /* distance bins held in the counts table      */
    RAPDF_PATH_LEN   = 400,
    RAPDF_LINE_LEN   = 1000,  /* PDB / decoy file line buffer                */
    RAPDF_ROW_LEN    = 16384, /* counts-file row buffer (100 CSV fields)     */
    RAPDF_NAME_LEN   = 21
};

/* Command-line options, in the order they are listed in rapdf_parse_args(). */
enum rapdf_option_id {
    RAPDF_OPT_DECOYS,
    RAPDF_OPT_OUTPUT,
    RAPDF_OPT_PDB,
    RAPDF_OPT_CHAIN,
    RAPDF_OPT_MAXDIST,
    RAPDF_OPT_START,
    RAPDF_OPT_END,
    RAPDF_OPT_COUNTS,
    RAPDF_OPT_TOTAL
};

/* Parsed command line. */
struct rapdf_options {
    char decoy_path[RAPDF_PATH_LEN];   /* -i          multi-model decoy file  */
    char output_path[RAPDF_PATH_LEN];  /* -o          CSV written by the run  */
    char pdb_path[RAPDF_PATH_LEN];     /* -pdb        target structure        */
    char counts_path[RAPDF_PATH_LEN];  /* -countsfile distance counts table   */
    char chain;                        /* -chain      chain holding the loop  */
    int max_distance;                  /* -maxdist    distance cut-off        */
    int loop_start;                    /* -start      first loop residue no.  */
    int loop_end;                      /* -end        last loop residue no.   */
};

/* Observed pair counts plus the marginal totals derived from them. */
struct rapdf_table {
    int counts[RAPDF_N_TYPES][RAPDF_N_TYPES][RAPDF_N_BINS];
    long long pair_totals[RAPDF_N_TYPES][RAPDF_N_TYPES];
    long long bin_totals[RAPDF_N_BINS];
    long long all_counts;
    int max_distance;
    int max_bin_no;
};

/* The fields of one ATOM record that the scoring needs. */
struct rapdf_atom {
    char chain;
    char ins_code;
    int res_seq;
    int atom_id;
    int res_id;
    double xyz[3];
};

/* Growable-by-reset collection of atoms; `count` of `capacity` slots in use. */
struct rapdf_atom_set {
    double **xyz;
    int *res_id;
    int *atom_id;
    int *res_index;  /* per-model running residue index (loop atoms only) */
    int count;
    int capacity;
};

/* Copy a path argument into a fixed buffer; returns 0 if it does not fit. */
static int rapdf_copy_path(char *dst, const char *src)
{
    int written = snprintf(dst, RAPDF_PATH_LEN, "%s", src);

    return written >= 0 && written < RAPDF_PATH_LEN;
}

/* Parse a whole-string decimal integer; returns 0 on malformed or huge input. */
static int rapdf_parse_int(const char *text, int *out)
{
    char *end;
    long value = strtol(text, &end, 10);

    if (end == text || *end != '\0' || (long)(int)value != value) {
        return 0;
    }
    *out = (int)value;
    return 1;
}

/*
 * Read the "-flag value" pairs from the command line into `opt`.
 * Every option is mandatory. Returns 1 on success; on failure a message has
 * been written to stderr and 0 is returned.
 */
static int rapdf_parse_args(int argc, char *argv[], struct rapdf_options *opt)
{
    static const char *const flags[RAPDF_OPT_TOTAL] = {
        "-i", "-o", "-pdb", "-chain", "-maxdist", "-start", "-end", "-countsfile"
    };
    int seen[RAPDF_OPT_TOTAL] = {0};
    int i;
    int k;

    for (i = 1; i < argc; i += 2) {
        const char *value;
        int ok = 1;

        for (k = 0; k < RAPDF_OPT_TOTAL; k++) {
            if (strcmp(argv[i], flags[k]) == 0) {
                break;
            }
        }
        if (k == RAPDF_OPT_TOTAL) {
            fprintf(stderr, "Error: Unrecognised argument, \"%s\"\n", argv[i]);
            return 0;
        }
        if (i + 1 >= argc) {
            fprintf(stderr, "Error: missing value for \"%s\"\n", argv[i]);
            return 0;
        }
        value = argv[i + 1];

        switch (k) {
        case RAPDF_OPT_DECOYS:  ok = rapdf_copy_path(opt->decoy_path, value);   break;
        case RAPDF_OPT_OUTPUT:  ok = rapdf_copy_path(opt->output_path, value);  break;
        case RAPDF_OPT_PDB:     ok = rapdf_copy_path(opt->pdb_path, value);     break;
        case RAPDF_OPT_COUNTS:  ok = rapdf_copy_path(opt->counts_path, value);  break;
        case RAPDF_OPT_CHAIN:   opt->chain = value[0];                          break;
        case RAPDF_OPT_MAXDIST: ok = rapdf_parse_int(value, &opt->max_distance); break;
        case RAPDF_OPT_START:   ok = rapdf_parse_int(value, &opt->loop_start);  break;
        case RAPDF_OPT_END:     ok = rapdf_parse_int(value, &opt->loop_end);    break;
        default:                ok = 0;                                         break;
        }
        if (!ok) {
            fprintf(stderr, "Error: invalid value \"%s\" for \"%s\"\n", value, argv[i]);
            return 0;
        }
        seen[k] = 1;
    }

    for (k = 0; k < RAPDF_OPT_TOTAL; k++) {
        if (!seen[k]) {
            fprintf(stderr, "Error: not enough arguments\n");
            fprintf(stderr, "Missing option: %s\n", flags[k]);
            return 0;
        }
    }

    /* max_bin_no = max_distance - 3 must index the 18-bin table. */
    if (opt->max_distance < 3 || opt->max_distance > RAPDF_N_BINS + 2) {
        fprintf(stderr, "Error: -maxdist must be between 3 and %d\n", RAPDF_N_BINS + 2);
        return 0;
    }
    return 1;
}

/*
 * Load the counts file into `table` and compute the marginal totals.
 *
 * A line starting with '>' begins the next distance bin; each following line
 * is one comma-separated row of that bin. One is added to every value read.
 * Cells the file does not mention hold 1, i.e. what a stored 0 would give.
 * Returns 0 (with a message on stderr) if the file cannot be opened.
 */
static int rapdf_load_table(const char *path, struct rapdf_table *table)
{
    char line[RAPDF_ROW_LEN];
    FILE *fp = fopen(path, "r");
    char *token;
    int bin = -1;
    int row = 0;
    int col;

    if (fp == NULL) {
        fprintf(stderr, "Error: could not open counts file \"%s\"\n", path);
        return 0;
    }

    for (row = 0; row < RAPDF_N_TYPES; row++) {
        for (col = 0; col < RAPDF_N_TYPES; col++) {
            for (bin = 0; bin < RAPDF_N_BINS; bin++) {
                table->counts[row][col][bin] = 1;
            }
        }
    }

    bin = -1;
    row = 0;
    while (fgets(line, sizeof line, fp) != NULL) {
        if (line[0] == '>') {
            bin++;
            row = 0;
            continue;
        }
        line[strcspn(line, "\r\n")] = '\0';

        col = 0;
        for (token = strtok(line, ","); token != NULL; token = strtok(NULL, ",")) {
            /* Data before the first '>' header or beyond the table is ignored. */
            if (bin >= 0 && bin < RAPDF_N_BINS && row < RAPDF_N_TYPES && col < RAPDF_N_TYPES) {
                table->counts[row][col][bin] = atoi(token) + 1;
            }
            col++;
        }
        row++;
    }
    fclose(fp);

    /* Observations per pair of atom types, over the bins in use. */
    for (row = 0; row < RAPDF_N_TYPES; row++) {
        for (col = 0; col < RAPDF_N_TYPES; col++) {
            table->pair_totals[row][col] = 0;
            for (bin = 0; bin <= table->max_bin_no; bin++) {
                table->pair_totals[row][col] += table->counts[row][col][bin];
            }
        }
    }

    /* Observations per bin; only the upper triangle (col >= row) is summed. */
    table->all_counts = 0;
    for (bin = 0; bin <= table->max_bin_no; bin++) {
        table->bin_totals[bin] = 0;
        for (row = 0; row < RAPDF_N_TYPES; row++) {
            for (col = row; col < RAPDF_N_TYPES; col++) {
                table->bin_totals[bin] += table->counts[row][col][bin];
            }
        }
        table->all_counts += table->bin_totals[bin];
    }
    return 1;
}

/* Copy up to `width` characters starting at column `start` into dst (NUL-terminated). */
static void rapdf_field(const char *line, size_t len, size_t start, size_t width, char *dst)
{
    size_t avail = len > start ? len - start : 0;

    if (avail > width) {
        avail = width;
    }
    if (avail > 0) {
        memcpy(dst, line + start, avail);
    }
    dst[avail] = '\0';
}

/*
 * Parse `line` as an ATOM record for one of the atoms get_atom_id() knows.
 * Returns 1 and fills `atom` on success, 0 for any other line.
 * Columns (0-based): 12-15 atom name, 17-19 residue name, 21 chain,
 * 22-25 residue number, 26 insertion code, 30-37 / 38-45 / 46-53 x / y / z.
 */
static int rapdf_parse_atom(const char *line, struct rapdf_atom *atom)
{
    char field[9];
    size_t len;
    int axis;

    if (strncmp(line, "ATOM  ", 6) != 0) {
        return 0;
    }
    len = strcspn(line, "\r\n");

    rapdf_field(line, len, 12, 4, field);
    atom->atom_id = get_atom_id(field);
    if (atom->atom_id >= RAPDF_N_ATOM_IDS) {
        return 0;
    }

    rapdf_field(line, len, 17, 3, field);
    atom->res_id = get_residue_id(field);

    atom->chain = len > 21 ? line[21] : ' ';
    atom->ins_code = len > 26 ? line[26] : ' ';

    rapdf_field(line, len, 22, 4, field);
    atom->res_seq = atoi(field);

    for (axis = 0; axis < 3; axis++) {
        rapdf_field(line, len, 30 + 8 * (size_t)axis, 8, field);
        atom->xyz[axis] = strtod(field, NULL);
    }
    return 1;
}

/* True when the atom lies on the loop chain within [loop_start, loop_end]. */
static int rapdf_in_loop(const struct rapdf_atom *atom, const struct rapdf_options *opt)
{
    return atom->chain == opt->chain
        && atom->res_seq >= opt->loop_start
        && atom->res_seq <= opt->loop_end;
}

/* Allocate room for `capacity` atoms. Returns 0 on allocation failure. */
static int rapdf_atom_set_init(struct rapdf_atom_set *set, int capacity)
{
    /* At least one slot, so that an empty set never asks malloc for 0 bytes. */
    int slots = capacity > 0 ? capacity : 1;

    allocate_array_double(&set->xyz, slots);
    set->capacity = slots;
    set->count = 0;
    set->res_id = malloc((size_t)slots * sizeof *set->res_id);
    set->atom_id = malloc((size_t)slots * sizeof *set->atom_id);
    set->res_index = malloc((size_t)slots * sizeof *set->res_index);

    if (set->res_id == NULL || set->atom_id == NULL || set->res_index == NULL) {
        fprintf(stderr, "Error: out of memory.\n");
        return 0;
    }
    return 1;
}

/* Release everything owned by the set; safe on a zero-initialised set. */
static void rapdf_atom_set_free(struct rapdf_atom_set *set)
{
    int i;

    if (set->xyz != NULL) {
        for (i = 0; i < set->capacity; i++) {
            free(set->xyz[i]);
        }
    }
    free(set->xyz);
    free(set->res_id);
    free(set->atom_id);
    free(set->res_index);
    set->xyz = NULL;
    set->res_id = NULL;
    set->atom_id = NULL;
    set->res_index = NULL;
    set->count = 0;
    set->capacity = 0;
}

/* Append an atom; silently ignored if the set is already full. */
static void rapdf_atom_set_add(struct rapdf_atom_set *set, const struct rapdf_atom *atom, int res_index)
{
    int slot = set->count;

    if (slot >= set->capacity) {
        return;
    }
    set->xyz[slot][0] = atom->xyz[0];
    set->xyz[slot][1] = atom->xyz[1];
    set->xyz[slot][2] = atom->xyz[2];
    set->res_id[slot] = atom->res_id;
    set->atom_id[slot] = atom->atom_id;
    set->res_index[slot] = res_index;
    set->count = slot + 1;
}

/*
 * Read the target structure and store every recognised atom that is NOT part
 * of the loop (other chains, and loop-chain residues outside the loop range).
 * Returns 0 with a message on stderr if the file cannot be read.
 */
static int rapdf_load_context(const struct rapdf_options *opt, struct rapdf_atom_set *set)
{
    char line[RAPDF_LINE_LEN];
    struct rapdf_atom atom;
    int total = 0;
    FILE *fp = fopen(opt->pdb_path, "r");

    if (fp == NULL) {
        fprintf(stderr, "Error: could not open PDB file \"%s\"\n", opt->pdb_path);
        return 0;
    }

    /* First pass sizes the arrays, second pass fills them. */
    while (fgets(line, sizeof line, fp) != NULL) {
        if (rapdf_parse_atom(line, &atom) && !rapdf_in_loop(&atom, opt)) {
            total++;
        }
    }
    if (!rapdf_atom_set_init(set, total)) {
        fclose(fp);
        return 0;
    }
    rewind(fp);
    while (fgets(line, sizeof line, fp) != NULL) {
        if (rapdf_parse_atom(line, &atom) && !rapdf_in_loop(&atom, opt)) {
            rapdf_atom_set_add(set, &atom, 0);
        }
    }

    fclose(fp);
    return 1;
}

/* Largest number of loop atoms found in any single model of the decoy file. */
static int rapdf_max_loop_atoms(FILE *decoys, const struct rapdf_options *opt)
{
    char line[RAPDF_LINE_LEN];
    struct rapdf_atom atom;
    int current = 0;
    int largest = 0;

    while (fgets(line, sizeof line, decoys) != NULL) {
        if (strncmp(line, "MODEL ", 6) == 0) {
            current = 0;
        }
        else if (rapdf_parse_atom(line, &atom) && rapdf_in_loop(&atom, opt)) {
            current++;
            if (current > largest) {
                largest = current;
            }
        }
    }
    return largest;
}

/*
 * Log-odds contribution of one atom pair, or 0.0 when the pair is not scored
 * (too far apart, unknown residue type, or no usable statistics).
 */
static double rapdf_pair_term(const struct rapdf_table *table,
                              double *xyz_a, int type_a,
                              double *xyz_b, int type_b)
{
    double dist;
    double observed;
    double expected;
    int bin;

    /* Unknown residues give a type index outside the table. */
    if (type_a < 0 || type_a >= RAPDF_N_TYPES || type_b < 0 || type_b >= RAPDF_N_TYPES) {
        return 0.0;
    }

    dist = get_distance(xyz_a, xyz_b);
    if (!(dist < table->max_distance)) {
        return 0.0;
    }

    bin = get_bin_number(dist, table->max_bin_no);
    if (bin < 0 || bin > table->max_bin_no) {
        return 0.0;
    }

    /* log() of a non-positive ratio would poison the whole sum. */
    if (table->counts[type_a][type_b][bin] <= 0 || table->pair_totals[type_a][type_b] <= 0
        || table->bin_totals[bin] <= 0 || table->all_counts <= 0) {
        return 0.0;
    }

    observed = (double)table->counts[type_a][type_b][bin] / (double)table->pair_totals[type_a][type_b];
    expected = (double)table->bin_totals[bin] / (double)table->all_counts;
    return log(observed / expected);
}

/*
 * Score one decoy: every loop atom against every context atom, plus every
 * pair of loop atoms that belong to different residues. The score is the
 * negated sum of the pair terms.
 */
static double rapdf_score(const struct rapdf_table *table,
                          const struct rapdf_atom_set *loop,
                          const struct rapdf_atom_set *context)
{
    double sum = 0.0;
    int i;
    int j;

    for (i = 0; i < loop->count; i++) {
        int type_i = loop->res_id[i] * RAPDF_N_ATOM_IDS + loop->atom_id[i];

        for (j = 0; j < context->count; j++) {
            int type_j = context->res_id[j] * RAPDF_N_ATOM_IDS + context->atom_id[j];

            sum += rapdf_pair_term(table, loop->xyz[i], type_i, context->xyz[j], type_j);
        }
    }

    for (i = 0; i < loop->count; i++) {
        int type_i = loop->res_id[i] * RAPDF_N_ATOM_IDS + loop->atom_id[i];

        for (j = i + 1; j < loop->count; j++) {
            int type_j;

            if (loop->res_index[i] == loop->res_index[j]) {
                continue;
            }
            type_j = loop->res_id[j] * RAPDF_N_ATOM_IDS + loop->atom_id[j];
            sum += rapdf_pair_term(table, loop->xyz[i], type_i, loop->xyz[j], type_j);
        }
    }

    return -sum;
}

/*
 * Walk the decoy file model by model. MODEL starts a new decoy (its second
 * whitespace-separated token is the name), ATOM lines inside the loop are
 * collected, and ENDMDL scores the collected atoms and writes one CSV row.
 */
static void rapdf_score_decoys(FILE *decoys, FILE *out,
                               const struct rapdf_options *opt,
                               const struct rapdf_table *table,
                               const struct rapdf_atom_set *context,
                               struct rapdf_atom_set *loop)
{
    char line[RAPDF_LINE_LEN];
    char decoy_name[RAPDF_NAME_LEN] = "";
    struct rapdf_atom atom;
    int res_index = 0;
    int have_prev = 0;
    int prev_res_seq = 0;
    char prev_ins_code = ' ';

    while (fgets(line, sizeof line, decoys) != NULL) {
        if (strncmp(line, "MODEL ", 6) == 0) {
            const char *name;

            /* Line endings are delimiters too, so a name that ends the line
               does not carry its newline into the output. */
            strtok(line, " \t\r\n");
            name = strtok(NULL, " \t\r\n");
            snprintf(decoy_name, sizeof decoy_name, "%.20s", name != NULL ? name : "");

            loop->count = 0;
            res_index = 0;
            have_prev = 0;
        }
        else if (strncmp(line, "ENDMDL", 6) == 0) {
            fprintf(out, "%s, %.3f\n", decoy_name, rapdf_score(table, loop, context));
        }
        else if (rapdf_parse_atom(line, &atom) && rapdf_in_loop(&atom, opt)) {
            /* A change of residue number or insertion code starts a new residue. */
            if (!have_prev || atom.res_seq != prev_res_seq || atom.ins_code != prev_ins_code) {
                have_prev = 1;
                prev_res_seq = atom.res_seq;
                prev_ins_code = atom.ins_code;
                res_index++;
            }
            rapdf_atom_set_add(loop, &atom, res_index);
        }
    }
}

/*
 * RAPDF scoring of loop decoys.
 *
 * Required arguments (each as "-flag value"):
 *   -i <decoys> -o <output csv> -pdb <target pdb> -chain <id>
 *   -maxdist <3..20> -start <res no> -end <res no> -countsfile <counts csv>
 *
 * Writes "Decoy, RAPDF_score" followed by one row per model of the decoy
 * file. Returns EXIT_SUCCESS, or EXIT_FAILURE after reporting the problem
 * on stderr.
 */
int main(int argc, char *argv[])
{
    /* Roughly 800 KB: static storage keeps the table off the stack. */
    static struct rapdf_table table;
    struct rapdf_options opt;
    struct rapdf_atom_set context = {0};
    struct rapdf_atom_set loop = {0};
    FILE *decoys = NULL;
    FILE *out = NULL;
    int status = EXIT_FAILURE;

    if (!rapdf_parse_args(argc, argv, &opt)) {
        return EXIT_FAILURE;
    }
    table.max_distance = opt.max_distance;
    table.max_bin_no = opt.max_distance - 3;

    if (!rapdf_load_table(opt.counts_path, &table)) {
        goto done;
    }
    if (!rapdf_load_context(&opt, &context)) {
        goto done;
    }

    decoys = fopen(opt.decoy_path, "r");
    if (decoys == NULL) {
        fprintf(stderr, "Could not open decoy list file.\n");
        goto done;
    }
    /* Size the loop arrays for the largest model so no decoy can overflow them. */
    if (!rapdf_atom_set_init(&loop, rapdf_max_loop_atoms(decoys, &opt))) {
        goto done;
    }
    rewind(decoys);

    out = fopen(opt.output_path, "w");
    if (out == NULL) {
        fprintf(stderr, "Error: could not open output file \"%s\"\n", opt.output_path);
        goto done;
    }
    fprintf(out, "Decoy, RAPDF_score\n");

    rapdf_score_decoys(decoys, out, &opt, &table, &context, &loop);
    status = ferror(out) ? EXIT_FAILURE : EXIT_SUCCESS;

done:
    if (out != NULL && fclose(out) != 0) {
        fprintf(stderr, "Error: could not write output file \"%s\"\n", opt.output_path);
        status = EXIT_FAILURE;
    }
    if (decoys != NULL) {
        fclose(decoys);
    }
    rapdf_atom_set_free(&loop);
    rapdf_atom_set_free(&context);
    return status;
}
