ProteoMaker/R/01_GenerateGroundTruth.R at master · computproteomics/ProteoMaker · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
################################################################################
#                  GENERATE PROTEOFORM-CENTRIC GROUND TRUTH                    #
################################################################################

#' Import and Process Protein Sequence Data for Simulation
#'
#' This function imports a single protein sequence FASTA file and optionally a text file containing selected
#' protein accessions. It processes the protein sequences by filtering out invalid entries, separating them into
#' modified and unmodified sets, and validating the input files.
#'
#' @param parameters A list containing the following elements:
#' \describe{
#'   \item{PathToFasta}{A character string specifying the path to the FASTA file containing protein sequences.}
#'   \item{PathToProteinList}{A character string specifying the path to the text file containing a list of selected
#'   protein accessions. If `NA`, a fraction of proteins will be selected randomly for modification.}
#'   \item{FracModProt}{A numeric value between 0 and 1 indicating the fraction of proteins to be modified.
#'   Only applicable if `PathToProteinList` is `NA`.}
#'   \item{ModifiableResidues}{A list of characters indicating the residues that can be modified in the
#'   protein sequences.}
#' }
#'
#' @return A list with two elements:
#' \describe{
#'   \item{to.Modify}{A data frame containing the protein sequences selected for modification, along with their
#'   corresponding metadata. If no proteins are selected for modification, this will be `NULL`.}
#'   \item{to.be.Unmodified}{A data frame containing the protein sequences that remain unmodified, along
#'   with their corresponding metadata. If no proteins remain unmodified, this will be `NULL`.}
#' }
#'
#' @details
#' The function performs the following steps:
#' \enumerate{
#'   \item Imports and validates the FASTA file.
#'   \item Filters out protein sequences containing unusual amino acids.
#'   \item Removes duplicate protein accessions.
#'   \item Fractionates the protein set into modified and unmodified fractions, based on either a specified
#'   percentage or a list of selected protein accessions.
#'   \item If the protein set is too small to be fractionated, it returns only an unmodified set.
#'   \item If the input files are corrupted or the protein set is empty after filtering, it returns `NULL` for
#'   both modified and unmodified sets.
#' }
#'
#' @keywords internal
#'
#'
proteinInput <- function(parameters) {
  message(" + Importing data:")
  # Check is the fasta file can be loaded.
  error <- try(protr::readFASTA(file = parameters$PathToFasta, legacy.mode = TRUE, seqonly = FALSE), silent = TRUE)

  if (!inherits(error, "try-error")) {
    # Read fasta.
    fasta <- protr::readFASTA(file = parameters$PathToFasta, legacy.mode = TRUE, seqonly = FALSE)
    fasta <- data.frame(
      Sequence = unlist(fasta), Accession = sub(".*[|]([^.]+)[|].*", "\\1", names(fasta)),
      stringsAsFactors = FALSE
    )
    rownames(fasta) <- seq_len(nrow(fasta))
    message("  - File ", parameters$PathToFasta, " imported, containing ", nrow(fasta), " protein sequences.")

    # Filter non-expressed proteins (random)
    if (parameters$PercExpressedProt < 1) {
      to.keep <- round(parameters$PercExpressedProt * nrow(fasta))
      fasta <- fasta[sample(seq_len(nrow(fasta)), size = to.keep), ]
      message("  - A total of ", nrow(fasta), " protein sequences have been randomly selected to be expressed.")
    }

    # Filter proteins carrying unusual amino acids.
    knownAA <- c("A", "L", "R", "K", "N", "M", "D", "F", "C", "P", "E", "S", "Q", "T", "G", "W", "H", "Y", "I", "V")
    unknownAA <- setdiff(LETTERS, knownAA)
    initialRows <- nrow(fasta)
    fasta <- fasta[if (initialRows > 1) {
      rowSums(sapply(unknownAA, grepl, x = fasta$Sequence)) == 0
    } else {
      sum(sapply(unknownAA, grepl, x = fasta$Sequence)) == 0
    }, ]
    message(
      "  - A total of ", initialRows - nrow(fasta),
      " protein sequences have been removed due to unusual amino acids ",
      paste0("(", paste0(unknownAA, collapse = ","), ")"), " composition."
    )

    # Remove duplicated protein accessions.
    message(
      "  - A total of ", length(which(duplicated(fasta$Accession))),
      " duplicated protein accessions have been removed."
    )
    fasta <- fasta[!duplicated(fasta$Accession), ]
    message("  - Total number of remaining protein sequences: ", nrow(fasta), "\n")

    # Proceed unless fasta df is empty after filtering.
    if (nrow(fasta) > 0) {
      message(" + Creating modified and unmodified fractions:")

      # Returns a list of the number of modifiable residues on all sequences, for each residue of ModifiableResidues.
      possible.modifiable.AAs <- as.data.frame(sapply(fasta$Sequence, function(x) {
        string <- strsplit(x, split = "")
        return(sapply(unlist(parameters$ModifiableResidues), function(y) length(which(string[[1]] == y))))
      }))

      rownames(possible.modifiable.AAs) <- seq_len(nrow(possible.modifiable.AAs))

      # Add an additional column, denoting the number of ModifiableResidues residue types found for each protein.
      possible.modifiable.AAs$Modifiable <- sapply(seq_len(nrow(possible.modifiable.AAs)), function(x) {
        length(which(possible.modifiable.AAs[x, ] > 0))
      })

      # Proteins that contain at least one of the ModifiableResidues.
      unmodifiable.indices <- which(possible.modifiable.AAs$Modifiable == 0)

      # Proteins that contain at least a residue of ModifiableResidues, and thus are modifiable.
      modifiable.indices <- setdiff(seq_len(nrow(fasta)), unmodifiable.indices)

      message("  - A total of ", length(unmodifiable.indices), " sequences are unmodifiable.")

      # If there is not a specific protein list to be modified.
      if (is.na(parameters$PathToProteinList)) {
        # If the set of proteins can be fractionated. This covers the case when parameters$FracModProt = 0
        if ((parameters$FracModProt * length(modifiable.indices)) >= 1) {
          # Randomly select a FracModProt % of modifiable proteins (modifiable.indices).
          to.modify.indices <- sample(modifiable.indices, size = parameters$FracModProt * length(modifiable.indices))
          to.Modify <- fasta[to.modify.indices, ]
          rownames(to.Modify) <- seq_len(nrow(to.Modify))

          message(
            "  - A total of ", length(modifiable.indices), " sequences are modifiable, from which ",
            parameters$FracModProt * 100, "% randomly selected to be modified."
          )

          to.be.Unmodified <- fasta[-to.modify.indices, ]

          # Add additional columns to the unmodified fraction df.
          to.be.Unmodified$PTMPos <- vector(mode = "list", length = nrow(to.be.Unmodified))
          to.be.Unmodified$PTMType <- vector(mode = "list", length = nrow(to.be.Unmodified))

          message("  - Modified fraction: ", nrow(to.Modify), " proteins.")
          message("  - Unmodified fraction: ", nrow(to.be.Unmodified), " proteins.")

          # Empty dfs replaced with NULL.This covers the case when parameters$FracModProt = 1
          if (nrow(to.be.Unmodified) > 0) {
            rownames(to.be.Unmodified) <- seq_len(nrow(to.be.Unmodified))
          } else {
            to.be.Unmodified <- NULL
          }

          return(list(to.Modify = to.Modify, to.be.Unmodified = to.be.Unmodified))
        } else {
          to.be.Unmodified <- fasta
          to.be.Unmodified$PTMPos <- vector(mode = "list", length = nrow(to.be.Unmodified))
          to.be.Unmodified$PTMType <- vector(mode = "list", length = nrow(to.be.Unmodified))

          message("  - The protein set cannot be fractionated, too low number of proteins or too low FracModProt %.")
          message("  - Modified fraction: 0 proteins.")
          message("  - Unmodified fraction: ", nrow(to.be.Unmodified), " proteins.\n")

          return(list(to.Modify = NULL, to.be.Unmodified = to.be.Unmodified))
        }
      } else { # If there is a specific protein list to be modified.

        # Check is the protein list file can be loaded.
        error <- try(read.csv(parameters$PathToProteinList, header = FALSE, stringsAsFactors = FALSE), silent = TRUE)

        if (!inherits(error, "try-error")) {
          protein.list.input <- unique(as.vector(read.csv(parameters$PathToProteinList,
            header = FALSE,
            stringsAsFactors = FALSE
          )[, 1]))
          message(
            "  - Protein list file ", parameters$PathToProteinList, " loaded and contains ",
            length(protein.list.input), " unique protein accessions."
          )

          mapping <- unlist(lapply(protein.list.input, function(x) which(fasta$Accession == x)))
          to.modify.indices <- intersect(mapping, modifiable.indices)

          if (length(to.modify.indices) > 0) {
            message(
              " - A total of ", length(mapping), " protein accessions have found in fasta file from which ",
              length(to.modify.indices), " are modifiable."
            )
            to.Modify <- fasta[to.modify.indices, ]
            rownames(to.Modify) <- seq_len(nrow(to.Modify))
            to.be.Unmodified <- fasta[-to.modify.indices, ]

            # Add additional columns to the unmodified fraction df.
            to.be.Unmodified$PTMPos <- vector(mode = "list", length = nrow(to.be.Unmodified))
            to.be.Unmodified$PTMType <- vector(mode = "list", length = nrow(to.be.Unmodified))

            message("  - Modified fraction: ", nrow(to.Modify), " proteins.")
            message("  - Unmodified fraction: ", nrow(to.be.Unmodified), " proteins.\n")

            if (nrow(to.be.Unmodified) > 0) {
              rownames(to.be.Unmodified) <- seq_len(nrow(to.be.Unmodified))
            } else {
              to.be.Unmodified <- NULL
            }

            return(list(to.Modify = to.Modify, to.be.Unmodified = to.be.Unmodified))
          } else {
            to.be.Unmodified <- fasta
            to.be.Unmodified$PTMPos <- vector(mode = "list", length = nrow(to.be.Unmodified))
            to.be.Unmodified$PTMType <- vector(mode = "list", length = nrow(to.be.Unmodified))
            message(
              "  - Proteins in ", parameters$PathToProteinList, " are not modifiable or are not found in ",
              parameters$PathToFasta, "."
            )
            message("  - Modified fraction: 0 proteins.")
            message("  - Unmodified fraction: ", nrow(to.be.Unmodified), " proteins.\n")

            return(list(to.Modify = NULL, to.be.Unmodified = to.be.Unmodified))
          }
        } else {
          message(crayon::red("  - Protein list file ", parameters$PathToProteinList, " couldn't be loaded!\n"))
          return(list(to.Modify = NULL, to.be.Unmodified = NULL))
        }
      }
    } else {
      message(crayon::red("  - There are no protein sequences left!\n"))
      return(list(to.Modify = NULL, to.be.Unmodified = NULL))
    }
  } else {
    message(crayon::red("  - Fasta file ", parameters$PathToFasta, " couldn't be loaded!\n"))
    return(list(to.Modify = NULL, to.be.Unmodified = NULL))
  }
}
#####################

#' Perform Modifications on Protein Sequences
#'
#' This function performs modifications on a fraction of protein sequences selected by the `proteinInput` function.
#' It generates proteoforms based on the provided parameters, applies modifications, and optionally retains some
#' sequences in their unmodified form.
#'
#' @param to.Modify A data frame containing the protein sequences that have been selected for modification.
#' This is typically the output from the `proteinInput` function.
#' @param parameters A list containing the following elements:
#' \describe{
#'   \item{PTMTypes}{A character vector specifying the types of post-translational modifications (PTMs) to be applied.}
#'   \item{PTMTypesDistr}{A numeric vector specifying the background frequency distribution of each PTM type.}
#'   \item{ModifiableResidues}{A list of character vectors specifying the residues that can be modified
#'   for each PTM type.}
#'   \item{ModifiableResiduesDistr}{A list of numeric vectors specifying the background frequency distribution of
#'   each residue type for modification.}
#'   \item{PropModPerProt}{A numeric value indicating the proportion of proteoforms to generate per protein sequence.}
#'   \item{RemoveNonModFormFrac}{A numeric value indicating the fraction of modified sequences that should retain
#'   their unmodified counterparts.}
#' }
#'
#' @return A list with two elements:
#' \describe{
#'   \item{mod.proteoforms}{A data frame containing the modified proteoforms, including details of the modification
#'   positions and types.}
#'   \item{unmod.proteoforms}{A data frame containing the unmodified counterparts of the selected protein sequences,
#'   if any. If no sequences are retained unmodified, this will be `NULL`.}
#' }
#'
#' @details
#' The function performs the following steps:
#' \enumerate{
#'   \item Selects a proportion of the input sequences to generate multiple proteoforms per sequence.
#'   \item Applies the specified modifications to the selected proteoforms.
#' \item Summarizes the types and residues of modifications applied to each proteoform.
#'   \item Optionally retains a fraction of the input sequences in their unmodified form, based on the specified
#'   parameters.
#' }
#'
#' @keywords internal
#'
performModification <- function(to.Modify, parameters) {
  ptmtypes <- unlist(parameters$PTMTypes)
  ptmtypes <- ptmtypes[!is.na(ptmtypes)]

  if (length(ptmtypes) > 0) {
    message(" + Performing modification:")
    message(
      "  - Selected modification type(s) ", paste0('"', paste0(ptmtypes, collapse = '", "'), '"'),
      " with background frequency distribution of ", paste0(paste0(unlist(parameters$PTMTypesDistr) * 100, "%"),
        collapse = ", "
      ), " respectively."
    )


    for (i in ptmtypes) {
      message(
        "  - For modification ", paste0('"', i, '"'), " residue(s) ", paste0(parameters$ModifiableResidues[[1]][[i]],
          collapse = ", "
        ),
        " can be modified with background frequency distribution of ",
        paste0(paste0(parameters$ModifiableResiduesDistr[[1]][[i]] * 100, "%"), collapse = ", "), " respectively."
      )
    }

    # All proteins in to.Modify set are selected to be modified at least once.
    # Then randomly a set of proteoforms from the imported to.Modify set is selected based on the fraction multiplier
    # PropModPerProt - 1. The size of mod.proteoforms set depends to PropModPerProt. (When 1 a proteoform set of size
    # equal to to.Modify is created, when 2 the size is double etc etc)
    mod.proteoforms <- to.Modify

    if (parameters$PropModPerProt >= 2) {
      mod.proteoforms <- rbind(
        mod.proteoforms,
        to.Modify[sample(
          x = seq_len(nrow(to.Modify)),
          size = (parameters$PropModPerProt - 1) *
            nrow(to.Modify), replace = TRUE
        ), ]
      )
      mod.proteoforms <- mod.proteoforms[order(mod.proteoforms$Accession), ]
    }

    # Create additional columns for modification positions and modification type.
    mod.proteoforms$PTMPos <- vector(mode = "list", length = nrow(mod.proteoforms))
    mod.proteoforms$PTMType <- vector(mode = "list", length = nrow(mod.proteoforms))

    # Modify the selected proteoforms.
    selected.modifications <- modify(mod.proteoforms$Sequence, parameters)
    # selected.modifications <- as.data.frame(do.call(rbind, selected.modifications))

    # Fill the columns.
    mod.proteoforms$PTMPos <- selected.modifications$Positions
    mod.proteoforms$PTMType <- selected.modifications$Types

    # Summarize modification types and residues modified by each type.
    Type_Count <- AA_Count <- list()
    for (ptm in ptmtypes) {
      AA_Count[[ptm]] <- list()
      for (res in parameters$ModifiableResidues[[1]][[ptm]]) {
        AA_Count[[ptm]][[res]] <- sum(unlist(sapply(selected.modifications$Count, function(x) x[[ptm]][[res]])))
      }
      Type_Count[[ptm]] <- sum(unlist(AA_Count[[ptm]]))
    }
    # print(count.per.AAs)

    message("  - Sequences modified!")

    for (ptm in ptmtypes) {
      message(
        "  - For modification ", paste0('"', ptm, '"'), " and residue(s) ",
        paste0(parameters$ModifiableResidues[[1]][[ptm]], collapse = ", "),
        " the resulting relative frequency distribution is ",
        paste0(paste0(round(100 * unlist(AA_Count[[ptm]]) / sum(unlist(AA_Count[[ptm]])), 3), "%"), collapse = ", "),
        " respectively."
      )
    }

    message(
      "  - The resulting relative frequency distribution for modification type(s) ",
      paste0('"', paste0(ptmtypes, collapse = '", "'), '"'),
      " is ", paste0(paste0(round(100 * unlist(Type_Count) / sum(unlist(Type_Count)), 3), "%"), collapse = ", "),
      " respectively."
    )

    # Select a  fraction of selected to-modify sequences, to remain unmodified too.
    unmodified.proteoforms.indices <- sample(seq_len(nrow(to.Modify)),
      size = (1 - parameters$RemoveNonModFormFrac) * nrow(to.Modify)
    )
    unmod.proteoforms <- to.Modify[unmodified.proteoforms.indices, ]
    unmod.proteoforms$PTMPos <- vector(mode = "list", length = nrow(unmod.proteoforms))
    unmod.proteoforms$PTMType <- vector(mode = "list", length = nrow(unmod.proteoforms))

    message(
      "  - A fraction of ", nrow(unmod.proteoforms),
      " modified protein sequences will maintain their unmodified counterpart ",
      paste0("(", (1 - parameters$RemoveNonModFormFrac) * 100, "%).")
    )

    return(list(mod.proteoforms = mod.proteoforms, unmod.proteoforms = unmod.proteoforms))
  } else {
    message(" + No modifications selected.")
    return(list(mod.proteoforms = NULL, unmod.proteoforms = NULL))
  }
}


####################
#' Modify single sequence
#'
#' This function modified based on the modification type and residue frequencies.
#'
#' @param seq A character vector of protein sequences to be modified.
#' @param pars A list of parameters for the modification process, including:
#'  \itemize{
#'  \item \code{ModifiableResidues}: A list of amino acid residues that can be modified for each PTM type.
#'  \item \code{PTMTypes}: A character vector of modification types.
#'  \item \code{PTMTypesDistr}: A numeric vector representing the background frequencies for each PTM type.
#'  \item \code{ModifiableResiduesDistr}: A list of numeric vectors representing the background frequencies
#'  for each modifiable residue within each PTM type.
#'  \item \code{PTMMultipleLambda}: A numeric value specifying the lambda parameter for the truncated Poisson
#'  distribution (truncated to [1, N]) controlling the number of modification sites per proteoform.
#'  Setting to \code{0} (or any value \eqn{\le 10^{-4}}) is treated as \code{1e-4} internally, which causes
#'  the truncated Poisson to almost surely return 1 — yielding approximately one modified site per PTM type
#'  per proteoform.  Larger values increase the expected number of modifications.
#'  }
#'
#'
#'  keywords internal
modify_seq <- function(seq, pars) {
  # Simplify parameters
  pmod_res <- pars[[1]]
  ptms <- pars[[2]]
  pmod_res_distr <- pars[[3]]
  ptms_distr <- pars[[4]]
  # Clamp lambda: values <= 1e-4 (including 0) cause the truncated Poisson to
  # almost surely return 1, giving approximately one modified site per PTM type.
  # This matches the user-visible contract that PTMMultipleLambda = 0 yields 1 site.
  lambda <- max(pars[[5]], 1e-4)

  seq_string <- strsplit(seq, split = "")

  # Run over modifications
  out <- setNames(
    lapply(ptms, function(x) {
      weight <- pmod_res_distr[[x]]
      names(weight) <- pmod_res[[x]]
      weight[!is.finite(weight)] <- 0

      # All modifiable residues
      all_positions <- setNames(lapply(pmod_res[[x]], function(y) which(y == seq_string[[1]])), pmod_res[[x]])

      # Adjust by residue frequency in protein
      weight <- sapply(pmod_res[[x]], function(y) {
        ttt <- weight[[y]] / length(all_positions[[y]]) * length(unlist(all_positions))
        ttt[ttt > 1] <- 1
        return(ttt)
      })

      # Run over residue type to get modified residues
      out <- lapply(pmod_res[[x]], function(y) {
        # Find positions of residues
        positions <- all_positions[[y]]
        # residues <- seq_string[[1]][positions]
        pos_len <- length(positions)
        # no modification site
        if (pos_len == 0) {
          return(NULL)
        }
        # getting modified sites
        out <- NULL
        if (pos_len > 1) {
          out <- sample(
            x = positions,
            size = weight[[y]] * extraDistr::rtpois(
              n = 1, lambda = lambda * pos_len,
              a = 1, b = pos_len
            )
          )
        } else if (runif(1) < weight[[y]]) {
          out <- positions
        }
        return(out)
      })
      names(out) <- pmod_res[[x]]
      nummod.per.residue <- lapply(out, function(y) {
        length(y)
      })
      return(list(modified = unlist(out), count = nummod.per.residue))
    }), ptms
  )
  nummod.per.ptm <- lapply(out, function(x) {
    x$count
  })
  names(nummod.per.ptm) <- ptms
  Types <- unlist(lapply(ptms, function(x) rep(x, sum(unlist(nummod.per.ptm[[x]])))))
  Positions <- unlist(lapply(ptms, function(x) unlist(out[[x]]$modified)))
  if (length(Positions) > 0) {
    ord <- order(Positions, Types)
    Positions <- Positions[ord]
    Types <- Types[ord]
  }
  return(list(Positions = Positions, Types = Types, Count = nummod.per.ptm))
}


#####################

#' Modify Protein Sequences Based on Modification Type and Residue Frequencies
#'
#' This function modifies protein sequences by introducing post-translational modifications (PTMs) based on
#' specified modification types and residue-specific background frequencies. The function identifies all
#' possible modification sites within each sequence, adjusts the probability of modification according to
#' the global background frequencies, and then samples the modification sites based on these probabilities.
#'
#' The number of modifications per sequence is determined using a truncated Poisson distribution, ensuring
#' a realistic distribution of modifications. The function returns a report detailing the modification positions,
#' types of modifications, and statistics on the number of modified amino acids per modification type.
#'
#' @param seq A character vector of protein sequences to be modified.
#' @param param A list of parameters for the modification process, including:
#'   \itemize{
#'     \item \code{ModifiableResidues}: A list of amino acid residues that can be modified for each PTM type.
#'     \item \code{PTMTypes}: A character vector of modification types.
#'     \item \code{PTMTypesDistr}: A numeric vector representing the background frequencies for each PTM type.
#'     \item \code{ModifiableResiduesDistr}: A list of numeric vectors representing the background frequencies
#'     for each modifiable residue within each PTM type.
#'     \item \code{PTMMultipleLambda}: A numeric value specifying the lambda parameter for the truncated Poisson
#'     distribution, which determines the number of modifications per sequence.
#'   }
#'
#' @return A list of lists, where each sublist corresponds to a sequence and contains:
#'   \itemize{
#'     \item \code{Positions}: A numeric vector of positions in the sequence where modifications were made.
#'     \item \code{Types}: A character vector of the modification types applied at each position.
#'     \item \code{Count}: A list of numeric vectors representing the count of modifications per amino acid residue
#'     for each PTM type.
#'   }
#'
#' @details
#' The function performs the following steps:
#' \enumerate{
#'   \item Identifies candidate modification sites in each sequence based on the specified modifiable residues.
#'   \item Adjusts the probability weights for each modification type and candidate residue according to the
#'   background frequencies and the composition of each sequence.
#'   \item Samples the modification sites based on the adjusted probabilities and a truncated Poisson distribution.
#'   \item Generates a report for each sequence, detailing the modification positions, types, and the number
#'   of modified amino acids per modification type.
#' }
#'
#' @keywords internal
modify <- function(seq, param) {
  # Simplify parameters
  pmod_res <- param$ModifiableResidues[[1]]
  pmod_res_distr <- param$ModifiableResiduesDistr[[1]]
  ptms <- param$PTMTypes[[1]]
  # Accept both formats:
  #   wrapped:  list(mods = c(ph = 1))  → [[1]] already gives c(ph = 1)
  #   flat:     list(ph = 1)            → [[1]] gives unnamed scalar; recover via unlist()
  ptms_distr <- param$PTMTypesDistr[[1]]
  if (is.null(names(ptms_distr))) {
    ptms_distr <- unlist(param$PTMTypesDistr)
  }
  # In case of multiple modification types, amino acid background frequences
  # for each type are proportionally adjusted to background frequences of modification types.
  pmod_res_distr <- setNames(lapply(
    ptms,
    function(x) {
      unlist(pmod_res_distr[[x]]) *
        ptms_distr[[x]]
    }
  ), ptms)

  pars <- list(pmod_res, ptms, pmod_res_distr, ptms_distr, param$PTMMultipleLambda)
  # Run over all sequences
  reported.modifications <- lapply(seq, function(x) modify_seq(x, pars))

  # Combine all lists into a single data frame
  reported.modifications <- data.frame(
    Positions = I(lapply(reported.modifications, "[[", "Positions")),
    Types = I(lapply(reported.modifications, "[[", "Types")),
    Counts = I(lapply(reported.modifications, "[[", "Count"))
  )

  return(reported.modifications)
}
#####################

#' Prepare a Sample of Proteoforms
#'
#' This function generates a set of proteoforms by using the `proteinInput` and `performModification` functions.
#' It returns a data frame that includes unmodified proteins, modified proteoforms, and modifiable but unmodified
#' proteins.
#'
#'
#' @param parameters A list of parameters required for sample preparation, including paths to input files and
#' modification details.
#'
#' @return A data frame containing all proteoforms generated, including:
#' \itemize{
#'   \item Unmodified proteins,
#'   \item Modified proteoforms,
#'   \item Modifiable but unmodified proteins.
#' }
#'
#' @details
#' The function performs the following steps:
#' \enumerate{
#'   \item Imports the protein sequences using `proteinInput`.
#'   \item Modifies the selected proteins using `performModification`.
#'   \item Combines unmodified and modified proteoforms into a single data frame.
#' }
#'
#' @keywords internal
samplePreparation <- function(parameters) {
  message("#SAMPLE PREPARATION - Start\n")

  protein.Sets <- proteinInput(parameters = parameters)

  if (is.null(protein.Sets$to.Modify) & is.null(protein.Sets$to.be.Unmodified)) {
    message(crayon::red("#SAMPLE PREPARATION - Finish (Warning: Empty sets)"))
    proteoforms <- NULL
  } else {
    if (!is.null(protein.Sets$to.Modify)) {
      proteoforms.after.modification <- performModification(to.Modify = protein.Sets$to.Modify, parameters = parameters)

      proteoforms <- rbind(
        protein.Sets$to.be.Unmodified,
        proteoforms.after.modification$mod.proteoform,
        proteoforms.after.modification$unmod.proteoforms
      )

      proteoforms$Proteoform_ID <- make.unique(proteoforms$Accession)
      proteoforms$Proteoform_ID <-
        rowSums(cbind(
          as.numeric(sub("\\.", "", sub("^[^.]*", "", proteoforms$Proteoform_ID))),
          rep(1, nrow(proteoforms))
        ), na.rm = TRUE)
      proteoforms <- proteoforms[, c(1, 2, 5, 3, 4)]

      rownames(proteoforms) <- seq_len(nrow(proteoforms))

      message("#SAMPLE PREPARATION - Finish\n")
    } else {
      proteoforms <- protein.Sets$to.be.Unmodified
      rownames(proteoforms) <- seq_len(nrow(proteoforms))
      proteoforms$PTMPos <- vector(mode = "list", length = nrow(proteoforms))
      proteoforms$PTMType <- vector(mode = "list", length = nrow(proteoforms))
      proteoforms$Proteoform_ID <- rep(1, nrow(proteoforms))
      proteoforms <- proteoforms[, c(1, 2, 5, 3, 4)]

      message("#SAMPLE PREPARATION - Finish\n")
    }
  }

  return(proteoforms)
}
#####################

#' Create a Regulation Pattern
#'
#' This function generates a regulation pattern for a given number of conditions. The pattern
#' consists of random values representing up-regulation or down-regulation for each condition.
#'
#' @param NumCond An integer specifying the number of conditions.
#'
#' @return A numeric vector of length `NumCond` representing the regulation pattern.
#'
#'
#' @keywords internal
createRegulationPattern <- function(NumCond) {
  # select 0.5 because division by 2 is already induced this way
  # this ensures that the differentiation amplitude is as speciefied by the user
  regulation_direction <- sample(unlist(lapply(
    seq_len(NumCond),
    function(x) c(0.5, -0.5) * x
  ))[seq_len(NumCond)], size = NumCond)
  return(regulation_direction)
}
#####################

#' Add Abundance Values to Proteoforms
#'
#' This function adds abundance values to proteoforms based on a combination of random noise and
#' differentially regulated patterns. It supports both relative and absolute quantification.
#'
#' @param proteoforms A data frame of proteoforms to which abundance values will be added.
#' @param parameters A list of parameters controlling the abundance assignment, including the
#' number of conditions, quantification columns, noise levels, and regulation patterns.
#'
#' @return A data frame with abundance values added to the proteoforms.
#'
#' @details
#' The function performs the following steps:
#' \enumerate{
#'   \item Adds random noise to the abundance columns.
#'   \item If specified, selects differentially regulated proteoforms and adjusts their abundance
#'   based on regulation patterns.
#'   \item Optionally, removes values below a threshold or adjusts values to an absolute quantification scale.
#' }
#'
#' @keywords internal
addProteoformAbundance <- function(proteoforms, parameters) {
  # populate the matrix with random noise
  for (name in parameters$QuantColnames) {
    proteoforms[name] <- rnorm(n = nrow(proteoforms), mean = 0, sd = parameters$QuantNoise)
  }

  if (is.na(parameters$UserInputFoldChanges_NumRegProteoforms) || is.na(parameters$UserInputFoldChanges_RegulationFC) ||
        parameters$UserInputFoldChanges_NumRegProteoforms == 0 || parameters$UserInputFoldChanges_RegulationFC == 0) {
    if (parameters$DiffRegFrac != 0) {
      # select differentially regulated proteoforms
      diff_reg_indices <- sample(seq_len(nrow(proteoforms)), size = parameters$DiffRegFrac * nrow(proteoforms))

      # print(diff_reg_indices)

      # determine amplitude of regulation for regulated proteoforms
      proteoforms[diff_reg_indices, "Regulation_Amplitude"] <- runif(
        min = 0,
        max = parameters$DiffRegMax,
        n = length(diff_reg_indices)
      )

      regulationPatterns <- lapply(
        seq_len(length(diff_reg_indices)),
        function(x) createRegulationPattern(parameters$NumCond)
      )
    } else {
      proteoforms$Regulation_Amplitude <- vector(mode = "list", length = nrow(proteoforms))
      regulationPatterns <- NULL
    }
  } else {
    # select differentially regulated proteoforms

    diff_reg_indices <- sample(seq_len(nrow(proteoforms)),
      size = sum(parameters$UserInputFoldChanges_NumRegProteoforms)
    )

    # determine amplitude of regulation for regulated proteoforms
    proteoforms[diff_reg_indices, "Regulation_Amplitude"] <- parameters$UserInputFoldChanges_RegulationFC


    regulationPatterns <- lapply(
      seq_len(length(diff_reg_indices)),
      function(x) createRegulationPattern(parameters$NumCond)
    )
  }

  proteoforms$Regulation_Pattern <- vector(mode = "list", length = nrow(proteoforms))

  # print(length(regulationPatterns))
  # print(regulationPatterns)

  if (!is.null(regulationPatterns)) {
    proteoforms$Regulation_Pattern[diff_reg_indices] <- regulationPatterns
    # [diff_reg_indices, "Regulation_Pattern"]
    proteoforms[diff_reg_indices, parameters$QuantColnames] <-
      # add regulation pattern*regulation amplitude to random noise
      proteoforms[diff_reg_indices, parameters$QuantColnames, drop = FALSE] +

      # generate regulation patterns for all regulated proteoforms
      t(sapply(seq_len(length(diff_reg_indices)), function(x) {
        rep(regulationPatterns[[x]], each = parameters$NumReps)

        # multiply regulation pattern with Regulation amplitude
      })) * (proteoforms[diff_reg_indices, "Regulation_Amplitude"])
  } else {
    proteoforms$Regulation_Pattern <- vector(mode = "list", length = nrow(proteoforms))
  }

  message(" + Add quan. distribution: Relative -> absolute")
  # Distribute a protein-level abundance draw across its proteoforms
  prot_index <- split(seq_len(nrow(proteoforms)), proteoforms$Accession)
  abs_vec <- numeric(nrow(proteoforms))
  for (indices in prot_index) {
    total_abundance <- rnorm(n = 1, mean = parameters$AbsoluteQuanMean, sd = parameters$AbsoluteQuanSD)
    weights <- rexp(n = length(indices))
    weights <- weights / sum(weights)
    contribution <- log2(2^total_abundance * weights)
    for (name in parameters$QuantColnames) {
      proteoforms[indices, name] <- proteoforms[indices, name] + contribution
    }
    abs_vec[indices] <- contribution
  }
  if (parameters$ThreshRemoveProteoforms > 0) {
    # Remove Values below the threshold set in the Parameters file
    thresh <- parameters$ThreshRemoveProteoforms
    proteoforms[, parameters$QuantColnames][proteoforms[, parameters$QuantColnames] < thresh] <- NA
  }

  return(proteoforms)
}
#####################