# mvcgi/parser.py at master · sirloon/mvcgi · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import os
import unicodedata
from collections import defaultdict

from csv import DictReader
from biothings.utils.dataload import dict_sweep, open_anyfile


def load_data(data_folder):
    """Yield variant documents parsed from ``cgi_biomarkers_per_variant.tsv``.

    The file is read as tab-separated text whose first line is the header.
    Exact duplicate lines are dropped (keeping the first occurrence, so the
    output order follows the file). Rows are kept only when they have a
    non-empty ``gDNA`` value and an ``Alteration type`` equal to ``'MUT'``.

    Each kept row becomes ``{'_id': <gDNA>, 'cgi': {...}}`` where ``cgi``
    holds a fixed selection of columns, NFKD-normalized, under lower-cased,
    underscore-separated keys (two columns are explicitly renamed). The
    document is then passed through ``dict_sweep`` with a list of placeholder
    values (``''``, ``'null'``, ``'N/A'``, ``None``, ``[]``, ``{}``).

    Args:
        data_folder: Directory containing ``cgi_biomarkers_per_variant.tsv``.

    Yields:
        One dict per distinct ``_id``. When a single row maps to the id the
        swept row document is yielded as is; when several rows share the id,
        ``cgi`` is a list with one entry per row.

    Raises:
        AssertionError: If the input file does not exist (raised when the
            generator is first advanced).
    """
    input_file = os.path.join(data_folder, "cgi_biomarkers_per_variant.tsv")
    assert os.path.exists(input_file), "Can't find input file '%s'" % input_file

    # Columns to export. A plain string is exported under its lower-cased,
    # underscore-separated name; a tuple is (source column, output key).
    columns = [
        'region', 'cDNA', 'Evidence level', 'transcript', 'Gene',
        ('individual_mutation', 'protein_change'), 'Primary Tumor type',
        ('Drug full name', 'drug'), 'Source', 'Association',
    ]
    # Resolve the (source column, output key) pairs once, not once per row.
    field_map = [
        k if isinstance(k, tuple) else (k, k.lower().replace(' ', '_'))
        for k in columns
    ]

    results = defaultdict(list)
    with open_anyfile(input_file) as in_f:
        header_line = next(in_f, None)
        if header_line is None:
            # Empty file: nothing to yield. Letting StopIteration escape a
            # generator body would surface as RuntimeError (PEP 479).
            return
        header = header_line.strip().split('\t')

        # Remove duplicated lines if any, keeping first-seen order so the
        # result is deterministic (a set would shuffle rows between runs).
        lines = list(dict.fromkeys(in_f))
        reader = DictReader(lines, fieldnames=header, delimiter='\t')

        for row in reader:
            # Skip rows without a genomic identifier. Short rows leave
            # missing fields as None, which must be skipped as well.
            if not row.get('gDNA'):
                continue

            # Skip variants that are not mutations
            if row.get('Alteration type') != 'MUT':
                continue

            cgi = {}
            for old_k, new_k in field_map:
                value = row.get(old_k)
                # unicodedata.normalize() only accepts str; a column that is
                # absent (or missing on a short row) stays None.
                if isinstance(value, str):
                    value = unicodedata.normalize("NFKD", value)
                cgi[new_k] = value

            # Use gDNA as variant identifier
            variant = {'_id': row['gDNA'], 'cgi': cgi}
            variant = dict_sweep(variant, vals=['', 'null', 'N/A', None, [], {}])
            results[variant['_id']].append(variant)

    # Merge duplications: rows sharing an _id are collapsed into a single
    # document whose 'cgi' is a list with one entry per row.
    for v in results.values():
        if len(v) == 1:
            yield v[0]
        else:
            yield {
                '_id': v[0]['_id'],
                'cgi': [i['cgi'] for i in v]
            }