ms_autoqc.DatabaseFunctions

   1import traceback
   2import warnings
   3warnings.simplefilter(action="ignore", category=FutureWarning)
   4
   5import os, io, shutil, time
   6import hashlib, json, ast
   7import pandas as pd
   8import numpy as np
   9import sqlalchemy as sa
  10from pydrive2.auth import GoogleAuth
  11from pydrive2.drive import GoogleDrive
  12from sqlalchemy import INTEGER, REAL, TEXT
  13import base64
  14from email.message import EmailMessage
  15import google.auth as google_auth
  16from googleapiclient.discovery import build
  17from googleapiclient.errors import HttpError
  18
  19# Set ms_autoqc/src as the working directory
  20src_folder = os.path.dirname(os.path.realpath(__file__))
  21os.chdir(src_folder)
  22
  23# Initialize directories
  24root_directory = os.getcwd()
  25data_directory = os.path.join(root_directory, "data")
  26methods_directory = os.path.join(data_directory, "methods")
  27auth_directory = os.path.join(root_directory, "auth")
  28
  29# Location of settings SQLite database
  30settings_database = "sqlite:///data/methods/Settings.db"
  31settings_db_file = os.path.join(methods_directory, "Settings.db")
  32
  33# Google Drive authentication files
  34credentials_file = os.path.join(auth_directory, "credentials.txt")
  35alt_credentials = os.path.join(auth_directory, "email_credentials.txt")
  36drive_settings_file = os.path.join(auth_directory, "settings.yaml")
  37auth_container = [GoogleAuth(settings_file=drive_settings_file)]
  38
  39"""
  40The functions defined below operate on two database types:
  41
  42- One storing instrument run metadata, sample QC results, and biological standard QC results
  43- The other storing instrument metadata, workspace settings for workspace access, chromatography methods, 
  44biological standards, QC configurations, and MS-DIAL configurations
  45  
  46In addition, this file also contains methods for syncing data and settings with Google Drive.
  47To get an overview of all functions, please visit the documentation on https://czbiohub.github.io/MS-AutoQC.
  48"""
  49
  50def get_database_file(instrument_id, sqlite_conn=False, zip=False):
  51
  52    """
  53    Returns database file for a given instrument ID.
  54
  55    Args:
  56        instrument_id (str):
  57            Instrument ID that specifies which database file to retrieve
  58        sqlite_conn (bool, default False):
  59            Whether to receive the path for establishing a SQLite connection
  60        zip (bool, default False):
  61            Whether to receive the path of the database file in the local app directory
  62
  63    Returns:
  64        str: Path for the database file
  65    """
  66
  67    if zip:
  68        filename = instrument_id.replace(" ", "_") + ".zip"
  69    else:
  70        filename = instrument_id.replace(" ", "_") + ".db"
  71
  72    if sqlite_conn:
  73        return "sqlite:///data/" + filename
  74    else:
  75        return os.path.join(data_directory, filename)
  76
  77
  78def connect_to_database(name):
  79
  80    """
  81    Establishes a connection to a SQLite database of choice
  82
  83    Args:
  84        name (str):
  85            Name of the database, either "Settings" or an instrument ID
  86
  87    Returns:
  88        sqlalchemy.MetaData:
  89            A container object that consists of different features of a database being described
  90        sqlalchemy.Connection:
  91            An object that represents a single DBAPI connection, and always emits SQL statements within
  92            the context of a transaction block
  93    """
  94
  95    if name == "Settings":
  96        database_file = settings_database
  97    else:
  98        database_file = get_database_file(instrument_id=name, sqlite_conn=True)
  99
 100    engine = sa.create_engine(database_file)
 101    db_metadata = sa.MetaData(bind=engine)
 102    connection = engine.connect()
 103
 104    return db_metadata, connection
 105
 106
 107def create_databases(instrument_id, new_instrument=False):
 108
 109    """
 110    Initializes SQLite databases for 1) instrument data and 2) workspace settings.
 111
 112    Creates the following tables in the instrument database: "runs", "bio_qc_results", "sample_qc_results".
 113
 114    Creates the following tables in the settings database: "biological_standards", "chromatography_methods",
 115    "email_notifications", "instruments", "gdrive_users", "internal_standards", "msdial_parameters", "qc_parameters",
 116    "targeted_features", "workspace".
 117
 118    Args:
 119        instrument_id (str):
 120            Instrument ID to name the new database ("Thermo QE 1" becomes "Thermo_QE_1.db")
 121        new_instrument (bool, default False):
 122            Whether a new instrument database is being added to a workspace, or whether a new
 123            instrument database AND settings database are being created for the first time
 124
 125    Returns:
 126        None
 127    """
 128
 129    # Create tables for instrument database
 130    instrument_database = get_database_file(instrument_id=instrument_id, sqlite_conn=True)
 131    qc_db_engine = sa.create_engine(instrument_database)
 132    qc_db_metadata = sa.MetaData()
 133
 134    bio_qc_results = sa.Table(
 135        "bio_qc_results", qc_db_metadata,
 136        sa.Column("id", INTEGER, primary_key=True),
 137        sa.Column("sample_id", TEXT),
 138        sa.Column("run_id", TEXT),
 139        sa.Column("polarity", TEXT),
 140        sa.Column("precursor_mz", TEXT),
 141        sa.Column("retention_time", TEXT),
 142        sa.Column("intensity", TEXT),
 143        sa.Column("md5", TEXT),
 144        sa.Column("qc_dataframe", TEXT),
 145        sa.Column("qc_result", TEXT),
 146        sa.Column("biological_standard", TEXT),
 147        sa.Column("position", TEXT)
 148    )
 149
 150    runs = sa.Table(
 151        "runs", qc_db_metadata,
 152        sa.Column("id", INTEGER, primary_key=True),
 153        sa.Column("run_id", TEXT),
 154        sa.Column("chromatography", TEXT),
 155        sa.Column("acquisition_path", TEXT),
 156        sa.Column("sequence", TEXT),
 157        sa.Column("metadata", TEXT),
 158        sa.Column("status", TEXT),
 159        sa.Column("samples", INTEGER),
 160        sa.Column("completed", INTEGER),
 161        sa.Column("passes", INTEGER),
 162        sa.Column("fails", INTEGER),
 163        sa.Column("latest_sample", TEXT),
 164        sa.Column("qc_config_id", TEXT),
 165        sa.Column("biological_standards", TEXT),
 166        sa.Column("pid", INTEGER),
 167        sa.Column("drive_id", TEXT),
 168        sa.Column("sample_status", TEXT),
 169        sa.Column("job_type", TEXT)
 170    )
 171
 172    sample_qc_results = sa.Table(
 173        "sample_qc_results", qc_db_metadata,
 174        sa.Column("id", INTEGER, primary_key=True),
 175        sa.Column("sample_id", TEXT),
 176        sa.Column("run_id", TEXT),
 177        sa.Column("polarity", TEXT),
 178        sa.Column("position", TEXT),
 179        sa.Column("md5", TEXT),
 180        sa.Column("precursor_mz", TEXT),
 181        sa.Column("retention_time", TEXT),
 182        sa.Column("intensity", TEXT),
 183        sa.Column("qc_dataframe", TEXT),
 184        sa.Column("qc_result", TEXT)
 185    )
 186
 187    qc_db_metadata.create_all(qc_db_engine)
 188
 189    # If only creating instrument database, save and return here
 190    if new_instrument:
 191        set_device_identity(is_instrument_computer=True, instrument_id=instrument_id)
 192        return None
 193
 194    # Create tables for Settings.db
 195    settings_db_engine = sa.create_engine(settings_database)
 196    settings_db_metadata = sa.MetaData()
 197
 198    instruments = sa.Table(
 199        "instruments", settings_db_metadata,
 200        sa.Column("id", INTEGER, primary_key=True),
 201        sa.Column("name", TEXT),
 202        sa.Column("vendor", TEXT),
 203        sa.Column("drive_id", TEXT),
 204        sa.Column("last_modified", TEXT)
 205    )
 206
 207    biological_standards = sa.Table(
 208        "biological_standards", settings_db_metadata,
 209        sa.Column("id", INTEGER, primary_key=True),
 210        sa.Column("name", TEXT),
 211        sa.Column("identifier", TEXT),
 212        sa.Column("chromatography", TEXT),
 213        sa.Column("num_pos_features", INTEGER),
 214        sa.Column("num_neg_features", INTEGER),
 215        sa.Column("pos_bio_msp_file", TEXT),
 216        sa.Column("neg_bio_msp_file", TEXT),
 217        sa.Column("pos_parameter_file", TEXT),
 218        sa.Column("neg_parameter_file", TEXT),
 219        sa.Column("msdial_config_id", TEXT)
 220    )
 221
 222    chromatography_methods = sa.Table(
 223        "chromatography_methods", settings_db_metadata,
 224        sa.Column("id", INTEGER, primary_key=True),
 225        sa.Column("method_id", TEXT),
 226        sa.Column("num_pos_standards", INTEGER),
 227        sa.Column("num_neg_standards", INTEGER),
 228        sa.Column("pos_istd_msp_file", TEXT),
 229        sa.Column("neg_istd_msp_file", TEXT),
 230        sa.Column("pos_parameter_file", TEXT),
 231        sa.Column("neg_parameter_file", TEXT),
 232        sa.Column("msdial_config_id", TEXT)
 233    )
 234
 235    gdrive_users = sa.Table(
 236        "gdrive_users", settings_db_metadata,
 237        sa.Column("id", INTEGER, primary_key=True),
 238        sa.Column("name", TEXT),
 239        sa.Column("email_address", TEXT),
 240        sa.Column("permission_id", TEXT),
 241    )
 242
 243    internal_standards = sa.Table(
 244        "internal_standards", settings_db_metadata,
 245        sa.Column("id", INTEGER, primary_key=True),
 246        sa.Column("name", TEXT),
 247        sa.Column("chromatography", TEXT),
 248        sa.Column("polarity", TEXT),
 249        sa.Column("precursor_mz", REAL),
 250        sa.Column("retention_time", REAL),
 251        sa.Column("ms2_spectrum", TEXT),
 252        sa.Column("inchikey", TEXT)
 253    )
 254
 255    msdial_parameters = sa.Table(
 256        "msdial_parameters", settings_db_metadata,
 257        sa.Column("id", INTEGER, primary_key=True),
 258        sa.Column("config_name", TEXT),
 259        sa.Column("rt_begin", INTEGER),
 260        sa.Column("rt_end", INTEGER),
 261        sa.Column("mz_begin", INTEGER),
 262        sa.Column("mz_end", INTEGER),
 263        sa.Column("ms1_centroid_tolerance", REAL),
 264        sa.Column("ms2_centroid_tolerance", REAL),
 265        sa.Column("smoothing_method", TEXT),
 266        sa.Column("smoothing_level", INTEGER),
 267        sa.Column("min_peak_width", INTEGER),
 268        sa.Column("min_peak_height", INTEGER),
 269        sa.Column("mass_slice_width", REAL),
 270        sa.Column("post_id_rt_tolerance", REAL),
 271        sa.Column("post_id_mz_tolerance", REAL),
 272        sa.Column("post_id_score_cutoff", REAL),
 273        sa.Column("alignment_rt_tolerance", REAL),
 274        sa.Column("alignment_mz_tolerance", REAL),
 275        sa.Column("alignment_rt_factor", REAL),
 276        sa.Column("alignment_mz_factor", REAL),
 277        sa.Column("peak_count_filter", INTEGER),
 278        sa.Column("qc_at_least_filter", TEXT)
 279    )
 280
 281    email_notifications = sa.Table(
 282        "email_notifications", settings_db_metadata,
 283        sa.Column("id", INTEGER, primary_key=True),
 284        sa.Column("email_address", TEXT),
 285    )
 286
 287    qc_parameters = sa.Table(
 288        "qc_parameters", settings_db_metadata,
 289        sa.Column("id", INTEGER, primary_key=True),
 290        sa.Column("config_name", TEXT),
 291        sa.Column("intensity_dropouts_cutoff", INTEGER),
 292        sa.Column("library_rt_shift_cutoff", REAL),
 293        sa.Column("in_run_rt_shift_cutoff", REAL),
 294        sa.Column("library_mz_shift_cutoff", REAL),
 295        sa.Column("intensity_enabled", INTEGER),
 296        sa.Column("library_rt_enabled", INTEGER),
 297        sa.Column("in_run_rt_enabled", INTEGER),
 298        sa.Column("library_mz_enabled", INTEGER)
 299    )
 300
 301    targeted_features = sa.Table(
 302        "targeted_features", settings_db_metadata,
 303        sa.Column("id", INTEGER, primary_key=True),
 304        sa.Column("name", TEXT),
 305        sa.Column("chromatography", TEXT),
 306        sa.Column("polarity", TEXT),
 307        sa.Column("biological_standard", TEXT),
 308        sa.Column("precursor_mz", REAL),
 309        sa.Column("retention_time", REAL),
 310        sa.Column("ms2_spectrum", TEXT),
 311        sa.Column("inchikey", TEXT)
 312    )
 313
 314    workspace = sa.Table(
 315        "workspace", settings_db_metadata,
 316        sa.Column("id", INTEGER, primary_key=True),
 317        sa.Column("slack_bot_token", TEXT),
 318        sa.Column("slack_channel", TEXT),
 319        sa.Column("slack_enabled", INTEGER),
 320        sa.Column("gdrive_folder_id", TEXT),
 321        sa.Column("methods_zip_file_id", TEXT),
 322        sa.Column("methods_last_modified", TEXT),
 323        sa.Column("msdial_directory", TEXT),
 324        sa.Column("is_instrument_computer", INTEGER),
 325        sa.Column("instrument_identity", TEXT)
 326    )
 327
 328    # Insert tables into database
 329    settings_db_metadata.create_all(settings_db_engine)
 330
 331    # Insert default configurations for MS-DIAL and MS-AutoQC
 332    add_msdial_configuration("Default")
 333    add_qc_configuration("Default")
 334
 335    # Initialize workspace metadata
 336    create_workspace_metadata()
 337
 338    # Save device identity based on setup values
 339    set_device_identity(is_instrument_computer=True, instrument_id=instrument_id)
 340    return None
 341
 342
 343def execute_vacuum(database):
 344
 345    """
 346    Executes VACUUM command on the database of choice.
 347
 348    Args:
 349        database (str): name of the database, either "Settings" or Instrument ID
 350
 351    Returns:
 352        None
 353    """
 354
 355    db_metadata, connection = connect_to_database(database)
 356    connection.execute("VACUUM")
 357    connection.close()
 358
 359
 360def get_drive_instance():
 361
 362    """
 363    Returns user-authenticated Google Drive instance.
 364    """
 365
 366    return GoogleDrive(auth_container[0])
 367
 368
 369def launch_google_drive_authentication():
 370
 371    """
 372    Launches Google Drive authentication flow and sets authentication instance.
 373    """
 374
 375    auth_container[0] = GoogleAuth(settings_file=drive_settings_file)
 376    auth_container[0].LocalWebserverAuth()
 377
 378
 379def save_google_drive_credentials():
 380
 381    """
 382    Saves Google credentials to a credentials.txt file.
 383    """
 384
 385    auth_container[0].SaveCredentialsFile(credentials_file)
 386
 387
 388def initialize_google_drive():
 389
 390    """
 391    Initializes instance of Google Drive using credentials.txt and settings.yaml in /auth directory
 392
 393    Args:
 394        None
 395
 396    Returns:
 397        bool: Whether the Google client credentials file (in the "auth" directory) exists.
 398    """
 399
 400    # Create Google Drive instance
 401    auth_container[0] = GoogleAuth(settings_file=drive_settings_file)
 402    gauth = auth_container[0]
 403
 404    # If no credentials file, make user authenticate
 405    if not os.path.exists(credentials_file) and is_valid():
 406        gauth.LocalWebserverAuth()
 407
 408    # Try to load saved client credentials
 409    gauth.LoadCredentialsFile(credentials_file)
 410
 411    # Initialize saved credentials
 412    if gauth.credentials is not None:
 413
 414        # Refresh credentials if expired
 415        if gauth.access_token_expired:
 416            gauth.Refresh()
 417
 418        # Otherwise, authorize saved credentials
 419        else:
 420            gauth.Authorize()
 421
 422    # If no saved credentials, make user authenticate again
 423    elif gauth.credentials is None:
 424        gauth.LocalWebserverAuth()
 425
 426    if not os.path.exists(credentials_file) and is_valid():
 427        save_google_drive_credentials()
 428
 429    # Makes small modification for emails (for usage with Google's google.auth)
 430    if not os.path.exists(alt_credentials):
 431        data = None
 432        with open(credentials_file, "r") as file:
 433            data = json.load(file)
 434            data["type"] = "authorized_user"
 435        with open(alt_credentials, "w") as file:
 436            json.dump(data, file)
 437
 438    return os.path.exists(credentials_file)
 439
 440
 441def is_valid(instrument_id=None):
 442
 443    """
 444    Checks that all required tables in all databases (or a single database of choice) are present.
 445
 446    Args:
 447        instrument_id (str, default None):
 448            Specified if validating a specific database
 449
 450    Returns:
 451        None
 452    """
 453
 454    # Validate settings database
 455    settings_db_required_tables = ["biological_standards", "chromatography_methods", "email_notifications", "instruments",
 456        "gdrive_users", "internal_standards", "msdial_parameters", "qc_parameters", "targeted_features", "workspace"]
 457
 458    try:
 459        settings_db_tables = sa.create_engine(settings_database).table_names()
 460        if len(settings_db_tables) < len(settings_db_required_tables):
 461            return False
 462    except:
 463        return False
 464
 465    # Validate instrument databases
 466    instrument_db_required_tables = ["bio_qc_results", "runs", "sample_qc_results"]
 467
 468    # If given an instrument ID, only validate that instrument's database
 469    try:
 470        if instrument_id is not None:
 471            database = get_database_file(instrument_id, sqlite_conn=True)
 472            instrument_db_tables = sa.create_engine(database).table_names()
 473            if len(instrument_db_tables) < len(instrument_db_required_tables):
 474                return False
 475
 476        # Otherwise, validate all instrument databases
 477        else:
 478            database_files = [file.replace(".db", "") for file in os.listdir(data_directory) if ".db" in file and "journal.db" not in file]
 479            databases = [get_database_file(f, sqlite_conn=True) for f in database_files]
 480
 481            for database in databases:
 482                instrument_db_tables = sa.create_engine(database).table_names()
 483                if len(instrument_db_tables) < len(instrument_db_required_tables):
 484                    return False
 485    except:
 486        return False
 487
 488    return True
 489
 490
 491def sync_is_enabled():
 492
 493    """
 494    Checks whether Google Drive sync is enabled simply by querying whether Google Drive ID's exist in the database.
 495
 496    Typically used for separating sync-specific functionality.
 497
 498    Returns:
 499        bool: Whether Google Drive sync is enabled or not
 500    """
 501
 502    if not is_valid():
 503        return False
 504
 505    df_workspace = get_table("Settings", "workspace")
 506    gdrive_folder_id = df_workspace["gdrive_folder_id"].values[0]
 507    methods_zip_file_id = df_workspace["methods_zip_file_id"].values[0]
 508
 509    if gdrive_folder_id is not None and methods_zip_file_id is not None:
 510        if gdrive_folder_id != "None" and methods_zip_file_id != "None":
 511            if gdrive_folder_id != "" and methods_zip_file_id != "":
 512                return True
 513
 514    return False
 515
 516
 517def email_notifications_are_enabled():
 518
 519    """
 520    Checks whether email notifications are enabled.
 521
 522    Returns True if databases are valid, Google Drive sync is enabled, and if email addresses were
 523    registered by user in Settings > General. Returns False if any condition is not met.
 524
 525    Returns:
 526        bool: True if email notifications are enabled, False if not
 527    """
 528
 529    if not is_valid():
 530        return False
 531
 532    if not sync_is_enabled():
 533        return False
 534
 535    if len(get_table("Settings", "email_notifications")) > 0:
 536        return True
 537
 538    return False
 539
 540
 541def slack_notifications_are_enabled():
 542
 543    """
 544    Checks whether Slack notifications are enabled.
 545
 546    Returns True if user enabled Slack notifications in Settings > General, and False if not.
 547
 548    Returns:
 549        bool: True if Slack notifications are enabled, False if not
 550    """
 551
 552    if not is_valid():
 553        return False
 554
 555    try:
 556        return bool(get_table("Settings", "workspace")["slack_enabled"].astype(int).tolist()[0])
 557    except:
 558        return False
 559
 560
 561def is_instrument_computer():
 562
 563    """
 564    Checks whether user's device is the instrument computer.
 565
 566    This is specified during setup. If the user created a new instrument, or signed in as an instrument device, then
 567    this will return True. If the user signed in to their workspace from a non-instrument device, this will return False.
 568
 569    Typically used to organize / hide UI functions for instrument and non-instrument devices
 570    that MS-AutoQC is installed on.
 571
 572    Returns:
 573        True if device is instrument computer, False if not
 574    """
 575
 576    return bool(get_table("Settings", "workspace")["is_instrument_computer"].astype(int).tolist()[0])
 577
 578
 579def get_md5_for_settings_db():
 580
 581    """
 582    Calculates and returns MD5 checksum for the settings database file.
 583
 584    Typically used for checking whether the user changed settings and prompting a Google Drive sync (if sync is enabled).
 585
 586    Returns:
 587        An MD5 checksum of /data/methods/Settings.db
 588    """
 589
 590    hash_md5 = hashlib.md5()
 591
 592    with open(settings_db_file, "rb") as f:
 593        for chunk in iter(lambda: f.read(4096), b""):
 594            hash_md5.update(chunk)
 595
 596    return hash_md5.hexdigest()
 597
 598
 599def settings_were_modified(md5_checksum):
 600
 601    """
 602    Checks whether settings database file has been modified.
 603
 604    This is done by comparing the checksum computed when Settings were opened (given as a parameter)
 605    with the checksum computed when Settings were closed (in this function call).
 606
 607    Args:
 608        md5_checksum (str):
 609            An MD5 checksum of /data/methods/Settings.db that was computed when the user opened Settings in the app
 610
 611    Returns:
 612        bool: True if checksums don't match, False if checksums match.
 613    """
 614
 615    if md5_checksum != get_md5_for_settings_db():
 616        return True
 617    else:
 618        return False
 619
 620
 621def zip_database(instrument_id=None, filename=None):
 622
 623    """
 624    Compresses instrument database file into a ZIP archive in /data directory.
 625
 626    Used for fast downloads / uploads over network connections to Google Drive (if Google Drive sync is enabled).
 627
 628    The zip archive is accessible by filename and path in the /data directory. For example, zipping
 629    the database for "Thermo QE 1" will generate a zip file with path "../data/Thermo_QE_1.zip".
 630
 631    Args:
 632        instrument_id (str, default None):
 633            If specified, selects a database to zip by instrument ID (ex: "Thermo QE 1")
 634        filename (str, default None):
 635            If specified, selects a database to zip by filename (ex: "Thermo_QE_1.zip")
 636
 637    Returns:
 638        None
 639    """
 640
 641    if instrument_id is None and filename is None:
 642        return None
 643
 644    if filename is not None:
 645        db_zip_file = os.path.join(data_directory, filename)
 646        filename = filename.replace(".zip", ".db")
 647
 648    elif instrument_id is not None:
 649        db_zip_file = get_database_file(instrument_id, zip=True)
 650        filename = instrument_id.replace(" ", "_") + ".db"
 651
 652    file_without_extension = db_zip_file.replace(".zip", "")
 653    shutil.make_archive(file_without_extension, "zip", data_directory, filename)
 654
 655
 656def unzip_database(instrument_id=None, filename=None):
 657
 658    """
 659    Unzips ZIP archive containing instrument database file and deletes the archive when complete.
 660
 661    Args:
 662        instrument_id (str, default None):
 663            If specified, selects a database to zip by instrument ID (ex: "Thermo QE 1")
 664        filename (str, default None):
 665            If specified, selects a database to zip by filename (ex: "Thermo_QE_1.zip")
 666
 667    Returns:
 668        None
 669    """
 670
 671    if instrument_id is None and filename is None:
 672        return None
 673
 674    if instrument_id is not None:
 675        db_zip_file = get_database_file(instrument_id, zip=True)
 676    elif filename is not None:
 677        db_zip_file = os.path.join(data_directory, filename)
 678
 679    shutil.unpack_archive(db_zip_file, data_directory, "zip")
 680    os.remove(db_zip_file)
 681
 682
 683def zip_methods():
 684
 685    """
 686    Compresses methods directory into a ZIP archive in /data directory.
 687
 688    Returns:
 689        Path for zip archive of methods directory (ex: "../data/methods.zip")
 690    """
 691
 692    output_directory_and_name = os.path.join(data_directory, "methods.zip").replace(".zip", "")
 693    shutil.make_archive(output_directory_and_name, "zip", methods_directory)
 694    return output_directory_and_name + ".zip"
 695
 696
 697def unzip_methods():
 698
 699    """
 700    Unzips ZIP archive containing methods directory and deletes the archive when complete.
 701    """
 702
 703    input_zip = os.path.join(data_directory, "methods.zip")
 704    shutil.unpack_archive(input_zip, methods_directory, "zip")
 705    os.remove(input_zip)
 706
 707
 708def zip_csv_files(input_directory, output_directory_and_name):
 709
 710    """
 711    Compresses CSV files into a ZIP archive in /data directory.
 712
 713    Used for fast upload of instrument run data to Google Drive during an active instrument run (if Google Drive sync is enabled).
 714
 715    Args:
 716        input_directory (str):
 717            The temporary directory for files pertaining to an instrument run, denoted as "Instrument_ID_Run_ID".
 718            For example, a job with ID "BRDE001" created under instrument with ID "Thermo QE 1" would have its files
 719            stored in "/data/Thermo_QE_1_BRDE001".
 720        output_directory_and_name (str):
 721            Essentially, the file path for the ZIP archive (ex: "/data/Instrument_ID_Run_ID").
 722
 723    Returns:
 724        Path for zip archive of CSV files with instrument run data (ex: "../data/Instrument_ID_Run_ID.zip")
 725    """
 726
 727    shutil.make_archive(output_directory_and_name, "zip", input_directory)
 728    return output_directory_and_name + ".zip"
 729
 730
 731def unzip_csv_files(input_zip, output_directory):
 732
 733    """
 734    Unzips ZIP archive of CSV files and deletes the archive upon completion.
 735    """
 736
 737    shutil.unpack_archive(input_zip, output_directory, "zip")
 738    os.remove(input_zip)
 739
 740
 741def get_table(database_name, table_name):
 742
 743    """
 744    Retrieves table from database as a pandas DataFrame object.
 745
 746    TODO: Improve this function to accept column and record queries
 747
 748    Args:
 749        database_name (str):
 750            The database to query, using instrument ID or "Settings"
 751        table_name (str):
 752            The table to retrieve
 753
 754    Returns:
 755        DataFrame of table.
 756    """
 757
 758    if database_name == "Settings":
 759        database = settings_database
 760    else:
 761        database = get_database_file(database_name, sqlite_conn=True)
 762
 763    engine = sa.create_engine(database)
 764    return pd.read_sql("SELECT * FROM " + table_name, engine)
 765
 766
 767def generate_client_settings_yaml(client_id, client_secret):
 768
 769    """
 770    Generates a settings.yaml file for Google authentication in the /auth directory.
 771
 772    Client ID and client secret are generated and provided by the user in the Google Cloud Console.
 773
 774    See: https://docs.iterative.ai/PyDrive2/oauth/#automatic-and-custom-authentication-with-settings-yaml
 775
 776    Args:
 777        client_id (str):
 778            The Client ID of the MS-AutoQC application, generated and provided by the user
 779        client_secret (str):
 780            The Client Secret of the MS-AutoQC application, generated and provided by the user
 781    Returns:
 782        None
 783    """
 784
 785    auth_directory = os.path.join(os.getcwd(), "auth")
 786    if not os.path.exists(auth_directory):
 787        os.makedirs(auth_directory)
 788
 789    settings_yaml_file = os.path.join(auth_directory, "settings.yaml")
 790
 791    lines = [
 792        "client_config_backend: settings",
 793        "client_config:",
 794        "  client_id: " + client_id,
 795        "  client_secret: " + client_secret,
 796        "\n",
 797        "save_credentials: True",
 798        "save_credentials_backend: file",
 799        "save_credentials_file: auth/credentials.txt",
 800        "\n",
 801        "get_refresh_token: True",
 802        "\n",
 803        "oauth_scope:",
 804        "  - https://www.googleapis.com/auth/drive",
 805        "  - https://www.googleapis.com/auth/gmail.send",
 806        "  - https://www.googleapis.com/auth/userinfo.email"
 807    ]
 808
 809    with open(settings_yaml_file, "w") as file:
 810        for line in lines:
 811            file.write(line)
 812            if line != "\n" and line != lines[-1]:
 813                file.write("\n")
 814
 815
 816def insert_google_drive_ids(instrument_id, gdrive_folder_id, instrument_db_file_id, methods_zip_file_id):
 817
 818    """
 819    Inserts Google Drive ID's into corresponding tables to enable Google Drive sync.
 820
 821    This function is called when a user creates a new instrument in their workspace.
 822
 823    The ID's for the following files / folders in Google Drive are stored in the database:
 824    1. MS-AutoQC folder
 825    2. Instrument database zip file
 826    3. Methods directory zip file
 827
 828    Args:
 829        instrument_id (str):
 830            Instrument ID
 831        gdrive_folder_id (str):
 832            Google Drive ID for the MS-AutoQC folder (found in the user's root directory in Drive)
 833        instrument_db_file_id (str):
 834            Google Drive ID for the instrument database ZIP file
 835        methods_zip_file_id (str):
 836            Google Drive ID for the methods directory ZIP file
 837
 838    Returns:
 839        None
 840    """
 841
 842    db_metadata, connection = connect_to_database("Settings")
 843    instruments_table = sa.Table("instruments", db_metadata, autoload=True)
 844    workspace_table = sa.Table("workspace", db_metadata, autoload=True)
 845
 846    # Instruments database
 847    connection.execute((
 848        sa.update(instruments_table)
 849            .where((instruments_table.c.name == instrument_id))
 850            .values(drive_id=instrument_db_file_id)
 851    ))
 852
 853    # MS-AutoQC folder and Methods folder
 854    connection.execute((
 855        sa.update(workspace_table)
 856            .where((workspace_table.c.id == 1))
 857            .values(gdrive_folder_id=gdrive_folder_id,
 858                    methods_zip_file_id=methods_zip_file_id)
 859    ))
 860
 861    connection.close()
 862
 863
 864def insert_new_instrument(name, vendor):
 865
 866    """
 867    Inserts a new instrument into the "instruments" table in the Settings database.
 868
 869    The name is the instrument ID, and the vendor is one of 5 options: Thermo Fisher, Agilent, Bruker, Sciex, and Waters.
 870
 871    Args:
 872        name (str):
 873            Instrument ID
 874        vendor (str):
 875            Instrument vendor
 876
 877    Returns:
 878        None
 879    """
 880
 881    # Connect to database
 882    db_metadata, connection = connect_to_database("Settings")
 883
 884    # Get "instruments" table
 885    instruments_table = sa.Table("instruments", db_metadata, autoload=True)
 886
 887    # Prepare insert of new instrument
 888    insert_instrument = instruments_table.insert().values(
 889        {"name": name,
 890         "vendor": vendor}
 891    )
 892
 893    # Execute the insert, then close the connection
 894    connection.execute(insert_instrument)
 895    connection.close()
 896
 897
 898def get_instruments_list():
 899
 900    """
 901    Returns list of instruments in database.
 902    """
 903
 904    # Connect to SQLite database
 905    engine = sa.create_engine(settings_database)
 906
 907    # Get instruments table as DataFrame
 908    df_instruments = pd.read_sql("SELECT * FROM instruments", engine)
 909
 910    # Return list of instruments
 911    return df_instruments["name"].astype(str).tolist()
 912
 913
 914def get_instrument(instrument_id):
 915
 916    """
 917    Returns record from "instruments" table as a DataFrame for a given instrument
 918
 919    Args:
 920        instrument_id (str): Instrument ID
 921
 922    Returns:
 923        DataFrame containing the name, vendor, and drive_id for the given instrument
 924    """
 925
 926    engine = sa.create_engine(settings_database)
 927    return pd.read_sql("SELECT * FROM instruments WHERE name = '" + instrument_id + "'", engine)
 928
 929
 930def get_filenames_from_sequence(sequence, vendor="Thermo Fisher"):
 931
 932    """
 933    Filters preblanks, washes, and shutdown injections from sequence file, and simultaneously assigns
 934    polariy to each sample based on presence of "Pos" or "Neg" in Instrument Method column.
 935
 936    This function is called upon starting a new QC job.
 937
 938    TODO: Adapt this function for other instrument vendors.
 939    TODO: Check the method filename, not entire file path, for "Pos" and "Neg".
 940        A folder containing "Pos" or "Neg" will give incorrect polarity assignments.
 941
 942    Args:
 943        sequence (str):
 944            The acquisition sequence file, encoded as a JSON string in "split" format
 945        vendor (str):
 946            The instrument vendor (see to-do statements)
 947
 948    Returns:
 949        DataFrame of acquisition sequence, with preblanks / washes / shutdowns filtered out and polarities assigned
 950    """
 951
 952    df_sequence = pd.read_json(sequence, orient="split")
 953
 954    # Filter out preblanks
 955    df_sequence = df_sequence.loc[
 956        ~((df_sequence["File Name"].str.contains(r"_BK_", na=False)) &
 957          (df_sequence["File Name"].str.contains(r"_pre_", na=False)))]
 958
 959    # Filter out wash and shutdown
 960    df_sequence = df_sequence.loc[
 961        ~(df_sequence["File Name"].str.contains(r"_wash_", na=False)) &
 962        ~(df_sequence["File Name"].str.contains(r"shutdown", na=False))]
 963
 964    # Derive polarity from instrument method filename
 965    df_sequence.loc[df_sequence["Instrument Method"].str.contains(r"Pos", na=False), "Polarity"] = "Pos"
 966    df_sequence.loc[df_sequence["Instrument Method"].str.contains(r"Neg", na=False), "Polarity"] = "Neg"
 967
 968    return df_sequence
 969
 970
 971def get_polarity_for_sample(instrument_id, run_id, sample_id, status):
 972
 973    """
 974    Returns polarity for a given sample.
 975
 976    TODO: Loading hundreds of rows of data before querying for one sample is massively inefficient.
 977        This function was written in haste and can be easily implemented in a much better way.
 978
 979    Args:
 980        instrument_id (str): Instrument ID
 981        run_id (str): Instrument run ID (job ID)
 982        sample_id (str): Sample ID
 983        status (str): Job status
 984
 985    Returns:
 986        Polarity for the given sample, as either "Pos" or "Neg".
 987    """
 988
 989    if get_device_identity() != instrument_id and sync_is_enabled():
 990        if status == "Complete":
 991            df = get_samples_in_run(instrument_id, run_id, "Both")
 992        elif status == "Active":
 993            df = get_samples_from_csv(instrument_id, run_id, "Both")
 994    else:
 995        df = get_samples_in_run(instrument_id, run_id, "Both")
 996
 997    try:
 998        polarity = df.loc[df["sample_id"] == sample_id]["polarity"].astype(str).values[0]
 999    except:
1000        print("Could not find polarity for sample in database.")
1001        polarity = "Neg" if "Neg" in sample_id else "Pos"
1002
1003    return polarity
1004
1005
def insert_new_run(run_id, instrument_id, chromatography, bio_standards, path, sequence, metadata, qc_config_id, job_type):

    """
    Initializes sample records in database for a new QC job.

    Performs the following functions:
        1. Inserts a record for the new instrument run into the "runs" table (status "Active", zeroed counters)
        2. Inserts sample rows into the "sample_qc_results" table
        3. Inserts biological standard sample rows into the "bio_qc_results" table

    A sample is treated as a biological standard when its file name contains one of the
    identifiers returned by get_biological_standard_identifiers(bio_standards).

    Args:
        run_id (str):
            Instrument run ID (job ID)
        instrument_id (str):
            Instrument ID
        chromatography (str):
            Chromatography method
        bio_standards (str):
            Biological standards
        path (str):
            Data acquisition path
        sequence (str):
            Acquisition sequence table, as JSON string in "split" format
            (it is parsed by get_filenames_from_sequence)
        metadata (str):
            Sample metadata table, as JSON string (stored as-is in the "runs" table)
        qc_config_id (str):
            Name of QC configuration
        job_type (str):
            Either "completed" or "active"

    Returns:
        None
    """

    # Get list of samples from sequence
    df_sequence = get_filenames_from_sequence(sequence)

    samples = df_sequence["File Name"].astype(str).tolist()
    polarities = df_sequence["Polarity"].astype(str).tolist()
    positions = df_sequence["Position"].astype(str).tolist()

    # Connect to database
    db_metadata, connection = connect_to_database(instrument_id)

    # Release the connection even if reflection or one of the INSERTs fails
    try:
        # Get relevant tables
        runs_table = sa.Table("runs", db_metadata, autoload=True)
        sample_qc_results_table = sa.Table("sample_qc_results", db_metadata, autoload=True)
        bio_qc_results_table = sa.Table("bio_qc_results", db_metadata, autoload=True)

        # Get identifiers for biological standard (if any)
        identifiers = get_biological_standard_identifiers(bio_standards)

        # Prepare insert of user-inputted run data
        insert_run = runs_table.insert().values(
            {"run_id": run_id,
             "chromatography": chromatography,
             "acquisition_path": path,
             "sequence": sequence,
             "metadata": metadata,
             "status": "Active",
             "samples": len(samples),
             "completed": 0,
             "passes": 0,
             "fails": 0,
             "qc_config_id": qc_config_id,
             "biological_standards": str(bio_standards),
             "job_type": job_type})

        insert_samples = []

        for sample, polarity, position in zip(samples, polarities, positions):

            # First biological standard identifier found in the sample name (None if there is none)
            matched_identifier = next(
                (identifier for identifier in identifiers.keys() if identifier in sample), None)

            # Prepare insert of the sample row into the "sample_qc_results" table
            if matched_identifier is None:
                insert_sample = sample_qc_results_table.insert().values(
                    {"sample_id": sample,
                     "run_id": run_id,
                     "polarity": polarity,
                     "position": position})

            # Prepare insert of the sample row into the "bio_qc_results" table
            else:
                insert_sample = bio_qc_results_table.insert().values(
                    {"sample_id": sample,
                     "run_id": run_id,
                     "polarity": polarity,
                     "biological_standard": identifiers[matched_identifier],
                     "position": position})

            # Add this INSERT query into the list of insert queries
            insert_samples.append(insert_sample)

        # Execute INSERTs to database: the run record first, then every sample row
        connection.execute(insert_run)

        for insert_sample in insert_samples:
            connection.execute(insert_sample)

    finally:
        connection.close()
1115
1116
def get_instrument_run(instrument_id, run_id):

    """
    Returns DataFrame of given instrument run from "runs" table.

    Args:
        instrument_id (str): Instrument ID
        run_id (str): Run ID

    Returns:
        DataFrame containing record for instrument run (empty if the run ID is not found)
    """

    database = get_database_file(instrument_id=instrument_id, sqlite_conn=True)
    engine = sa.create_engine(database)

    # Bind the run ID as a parameter rather than concatenating it into the SQL text,
    # so that run ID's containing quotes cannot break or alter the query
    query = sa.text("SELECT * FROM runs WHERE run_id = :run_id")
    df_instrument_run = pd.read_sql(query, engine, params={"run_id": run_id})
    return df_instrument_run
1135
1136
def get_instrument_run_from_csv(instrument_id, run_id):

    """
    Loads the record of an instrument run from its "run.csv" file instead of the database.

    This function is called when a user views an active instrument run from an external device
    (to prevent downloading / uploading the database file with each sample acquisition).

    The file is read from <data directory>/<instrument ID with spaces as underscores>_<run ID>/csv/run.csv.

    Args:
        instrument_id (str): Instrument ID
        run_id (str): Run ID

    Returns:
        DataFrame containing record for instrument run
    """

    # Folder name for the job: instrument ID (spaces -> underscores), underscore, run ID
    job_folder = "_".join([instrument_id.replace(" ", "_"), run_id])
    csv_path = os.path.join(data_directory, job_folder, "csv", "run.csv")

    return pd.read_csv(csv_path, index_col=False)
1156
1157
def get_instrument_runs(instrument_id, as_list=False):

    """
    Fetches every record in the "runs" table of the given instrument's database.

    Args:
        instrument_id (str):
            Instrument ID
        as_list (bool, default False):
            If True, returns only a list of names of instrument runs (jobs)

    Returns:
        DataFrame containing records for instrument runs (QC jobs) for the given instrument,
        or a list of run ID's (as strings) if as_list is True
    """

    engine = sa.create_engine(get_database_file(instrument_id, sqlite_conn=True))
    df_runs = pd.read_sql("SELECT * FROM runs", engine)

    # Either the whole table, or just the run ID's as strings
    return df_runs["run_id"].astype(str).tolist() if as_list else df_runs
1181
1182
def delete_instrument_run(instrument_id, run_id):

    """
    Deletes all records for an instrument run (QC job) from the database.

    Rows matching the run ID are removed from the "runs", "sample_qc_results",
    and "bio_qc_results" tables, in that order.

    Args:
        instrument_id (str): Instrument ID
        run_id (str): Run ID

    Returns:
        None
    """

    # Connect to database
    db_metadata, connection = connect_to_database(instrument_id)

    # Release the connection even if reflection or one of the DELETEs fails
    try:
        # Delete from each table
        for table_name in ["runs", "sample_qc_results", "bio_qc_results"]:
            table = sa.Table(table_name, db_metadata, autoload=True)
            connection.execute((
                sa.delete(table).where(table.c.run_id == run_id)
            ))
    finally:
        connection.close()
1212
1213
def get_acquisition_path(instrument_id, run_id):

    """
    Looks up the data acquisition path recorded for an instrument run.

    Args:
        instrument_id (str): Instrument ID
        run_id (str): Run ID

    Returns:
        Acquisition path (as a string) of the first record returned for the given instrument run
    """

    df_run = get_instrument_run(instrument_id, run_id)
    acquisition_paths = df_run["acquisition_path"].astype(str).tolist()

    # First (normally only) matching record
    return acquisition_paths[0]
1228
1229
def get_md5(instrument_id, sample_id):

    """
    Returns MD5 checksum for a data file in "sample_qc_results" table
    (or "bio_qc_results" table, if the sample ID contains a biological standard identifier).

    Used for comparing MD5 checksums during active instrument runs.

    TODO: This function will return incorrect results if two different instrument runs
        have samples with the same sample ID. It needs to include "run_id" in the SQL query.

    Args:
        instrument_id (str): Instrument ID
        sample_id (str): Sample ID

    Returns:
        MD5 checksum stored for the data file (value of the first matching row, cast to a string).
    """

    # Connect to database
    database = get_database_file(instrument_id, sqlite_conn=True)
    engine = sa.create_engine(database)

    # Check if sample is a biological standard
    is_bio_standard = any(
        identifier in sample_id for identifier in get_biological_standard_identifiers().keys())
    table = "bio_qc_results" if is_bio_standard else "sample_qc_results"

    # The table name comes from the two fixed values above; the sample ID is bound as a
    # parameter so that quotes in a file name cannot break or alter the query
    query = sa.text("SELECT * FROM " + table + " WHERE sample_id = :sample_id")
    df_sample_qc_results = pd.read_sql(query, engine, params={"sample_id": sample_id})

    return df_sample_qc_results["md5"].astype(str).values[0]
1265
1266
def update_md5_checksum(instrument_id, sample_id, md5_checksum):

    """
    Updates MD5 checksum for a data file during sample acquisition.

    The checksum is written to the "bio_qc_results" table if the sample ID contains a
    biological standard identifier, and to the "sample_qc_results" table otherwise.

    TODO: This function will update the wrong rows if two different instrument runs
        have samples with the same sample ID. It needs to include "run_id" in the SQL query.

    Args:
        instrument_id (str):
            Instrument ID
        sample_id (str):
            Sample ID (filename) of data file
        md5_checksum (str):
            MD5 checksum for the sample data file

    Returns:
        None
    """

    # Connect to database
    db_metadata, connection = connect_to_database(instrument_id)

    # Release the connection even if reflection or the UPDATE fails
    try:
        # Check if sample is a biological standard and reflect only the relevant table
        is_bio_standard = any(
            identifier in sample_id for identifier in get_biological_standard_identifiers().keys())
        table_name = "bio_qc_results" if is_bio_standard else "sample_qc_results"
        qc_results_table = sa.Table(table_name, db_metadata, autoload=True)

        # Prepare update of MD5 checksum at sample row
        update_md5 = (
            sa.update(qc_results_table)
                .where(qc_results_table.c.sample_id == sample_id)
                .values(md5=md5_checksum)
        )

        # Execute UPDATE into database
        connection.execute(update_md5)
    finally:
        connection.close()
1308
1309
def write_qc_results(sample_id, instrument_id, run_id, json_mz, json_rt, json_intensity, qc_dataframe, qc_result, is_bio_standard):

    """
    Writes QC results (as dictionary records) to sample record upon MS-DIAL processing completion.

    QC results consist of m/z, RT, and intensity data for internal standards (or targeted metabolites in biological standards),
    as well as a DataFrame containing delta m/z, delta RT, in-run delta RT, warnings, and fails (qc_dataframe) and overall QC result
    (which will be "Pass" or "Fail").

    The data is encoded as dictionary in "records" format: [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}].
    This dictionary is cast to a string before being passed to this function.

    The row to update is identified by both sample ID and run ID, in the "bio_qc_results" table
    if is_bio_standard is truthy and in the "sample_qc_results" table otherwise.

    TODO: Update names of arguments from json_x to record_x, as the data is no longer encoded as JSON strings.
        The data is now encoded in "records" format as a string.

    Args:
        sample_id (str):
            Sample ID
        instrument_id (str):
            Instrument ID
        run_id (str):
            Instrument run ID (Job ID)
        json_mz (str):
            String dict of internal standard m/z data in "records" format
        json_rt (str):
            String dict of internal standard RT data in "records" format
        json_intensity (str):
            String dict of internal standard intensity data in "records" format
        qc_dataframe (str):
            String dict of various QC data in "records" format
        qc_result (str):
            QC result for sample, either "Pass" or "Fail"
        is_bio_standard (bool):
            Whether the sample is a biological standard

    Returns:
        None
    """

    # Connect to database
    db_metadata, connection = connect_to_database(instrument_id)

    # Release the connection even if reflection or the UPDATE fails
    try:
        # Get "sample_qc_results" or "bio_qc_results" table
        table_name = "bio_qc_results" if is_bio_standard else "sample_qc_results"
        qc_results_table = sa.Table(table_name, db_metadata, autoload=True)

        # Prepare update (insert) of QC results to correct sample row
        update_qc_results = (
            sa.update(qc_results_table)
                .where((qc_results_table.c.sample_id == sample_id)
                       & (qc_results_table.c.run_id == run_id))
                .values(precursor_mz=json_mz,
                        retention_time=json_rt,
                        intensity=json_intensity,
                        qc_dataframe=qc_dataframe,
                        qc_result=qc_result)
        )

        # Execute UPDATE into database
        connection.execute(update_qc_results)
    finally:
        connection.close()
1373
1374
def get_chromatography_methods():

    """
    Reads the full "chromatography_methods" table of the Settings database.

    Returns:
        DataFrame with one row per chromatography method
    """

    settings_engine = sa.create_engine(settings_database)
    return pd.read_sql("SELECT * FROM chromatography_methods", settings_engine)
1384
1385
def get_chromatography_methods_list():

    """
    Collects the ID's (names) of all chromatography methods in the Settings database.

    Returns:
        List of values of the "method_id" column, each cast to a string
    """

    method_ids = get_chromatography_methods()["method_id"]
    return method_ids.astype(str).tolist()
1394
1395
def insert_chromatography_method(method_id):

    """
    Inserts new chromatography method in the "chromatography_methods" table of the Settings database.

    The method is created with zero internal standards, empty MSP / parameter file names, and the
    "Default" MS-DIAL configuration. In addition, one row for the new method is added to the
    "biological_standards" table for every distinct biological standard, with zero features
    and the "Default" MS-DIAL configuration.

    Args:
        method_id (str): Name of the chromatography method

    Returns:
        None
    """

    # Connect to database
    db_metadata, connection = connect_to_database("Settings")

    # Release the connection even if reflection or one of the INSERTs fails
    try:
        # Get "chromatography_methods" table and "biological_standards" table
        chromatography_table = sa.Table("chromatography_methods", db_metadata, autoload=True)
        biological_standards_table = sa.Table("biological_standards", db_metadata, autoload=True)

        # Execute insert of chromatography method
        insert_method = chromatography_table.insert().values(
            {"method_id": method_id,
             "num_pos_standards": 0,
             "num_neg_standards": 0,
             "pos_istd_msp_file": "",
             "neg_istd_msp_file": "",
             "pos_parameter_file": "",
             "neg_parameter_file": "",
             "msdial_config_id": "Default"})

        connection.execute(insert_method)

        # The "biological_standards" table holds one row per standard per chromatography method,
        # so de-duplicate names and identifiers TOGETHER. De-duplicating only the names would
        # pair a standard with the identifier of whichever row happens to share its index.
        df_biological_standards = get_biological_standards()
        df_unique_standards = (
            df_biological_standards[["name", "identifier"]]
                .astype(str)
                .drop_duplicates(subset="name")
        )

        # Execute insert of method for each biological standard
        for biological_standard, identifier in zip(
                df_unique_standards["name"].tolist(), df_unique_standards["identifier"].tolist()):
            insert_method_for_bio_standard = biological_standards_table.insert().values({
                "name": biological_standard,
                "identifier": identifier,
                "chromatography": method_id,
                "num_pos_features": 0,
                "num_neg_features": 0,
                "msdial_config_id": "Default"})
            connection.execute(insert_method_for_bio_standard)

    finally:
        connection.close()
1445
1446
def remove_chromatography_method(method_id):

    """
    Deletes chromatography method and all associated records from the Settings database.

    Details:
        1. Deletes the method's MSP files (internal standard and biological standard libraries)
           from the "methods" directory
        2. Removes chromatography method in "chromatography_methods" table
        3. Removes method from "biological_standards" table
        4. Removes associated internal standards from "internal_standards" table
        5. Removes associated targeted features from "targeted_features" table

    NOTE: This function itself makes no Google Drive calls; removing the MSPs from Google Drive
        (if sync is enabled) has to be handled elsewhere.

    Args:
        method_id (str): Name of the chromatography method

    Returns:
        None
    """

    # Collect the MSP filenames recorded for this method
    df = get_table("Settings", "chromatography_methods")
    df = df.loc[df["method_id"] == method_id]

    df2 = get_table("Settings", "biological_standards")
    df2 = df2.loc[df2["chromatography"] == method_id]

    files_to_delete = set(
        df["pos_istd_msp_file"].astype(str).tolist() + df["neg_istd_msp_file"].astype(str).tolist() +
        df2["pos_bio_msp_file"].astype(str).tolist() + df2["neg_bio_msp_file"].astype(str).tolist())

    # Delete corresponding MSPs from "methods" directory (only files that actually exist there)
    for file in os.listdir(methods_directory):
        if file in files_to_delete:
            os.remove(os.path.join(methods_directory, file))

    # Connect to database
    db_metadata, connection = connect_to_database("Settings")

    # Release the connection even if reflection or one of the DELETEs fails
    try:
        # Get relevant tables
        chromatography_table = sa.Table("chromatography_methods", db_metadata, autoload=True)
        biological_standards_table = sa.Table("biological_standards", db_metadata, autoload=True)
        internal_standards_table = sa.Table("internal_standards", db_metadata, autoload=True)
        targeted_features_table = sa.Table("targeted_features", db_metadata, autoload=True)

        # Remove from "chromatography_methods" table
        delete_queries = [
            sa.delete(chromatography_table)
                .where((chromatography_table.c.method_id == method_id))
        ]

        # Remove all entries in other tables associated with chromatography
        for table in [biological_standards_table, internal_standards_table, targeted_features_table]:
            delete_queries.append(
                sa.delete(table)
                    .where((table.c.chromatography == method_id))
            )

        # Execute all deletes
        for query in delete_queries:
            connection.execute(query)

    finally:
        connection.close()
1511
1512
def update_msdial_config_for_internal_standards(chromatography, config_id):

    """
    Updates MS-DIAL configuration for a given chromatography method.

    This MS-DIAL configuration will be used to generate a parameters file
    for processing samples run with this chromatography method.

    The value is written to the "msdial_config_id" column of the "chromatography_methods" table.

    Args:
        chromatography (str):
            Chromatography method ID (name)
        config_id (str):
            MS-DIAL configuration ID (name)

    Returns:
        None
    """

    # Connect to database
    db_metadata, connection = connect_to_database("Settings")

    # Release the connection even if reflection or the UPDATE fails
    try:
        methods_table = sa.Table("chromatography_methods", db_metadata, autoload=True)

        # Update MS-DIAL configuration for chromatography method
        update_msdial_config = (
            sa.update(methods_table)
                .where(methods_table.c.method_id == chromatography)
                .values(msdial_config_id=config_id)
        )

        connection.execute(update_msdial_config)
    finally:
        connection.close()
1544
1545
1546def add_msp_to_database(msp_file, chromatography, polarity, bio_standard=None):
1547
1548    """
1549    Parses compounds from MSP into the Settings database.
1550
1551    This function writes features from an MSP file into the "internal_standards" or "targeted_features" table,
1552    and inserts location of pos/neg MSP files into "chromatography_methods" table.
1553
1554    TODO: The MSP/TXT libraries have standardized names; there is no need to store the filename in the database.
1555
1556    Args:
1557        msp_file (io.StringIO):
1558            In-memory text-stream file object
1559        chromatography (str):
1560            Chromatography method ID (name)
1561        polarity (str):
1562            Polarity for which MSP should be used for ("Positive Mode" or "Negative Mode")
1563        bio_standard (str, default None):
1564            Parses MSP and applies to biological standard within a chromatography-polarity combination
1565
1566    Returns:
1567        None
1568    """
1569
1570    # Connect to database
1571    db_metadata, connection = connect_to_database("Settings")
1572
1573    # Write MSP file to folder, store file path in database (further down in function)
1574    if not os.path.exists(methods_directory):
1575        os.makedirs(methods_directory)
1576
1577    if bio_standard is not None:
1578        if polarity == "Positive Mode":
1579            filename = bio_standard.replace(" ", "_") + "_" + chromatography + "_Pos.msp"
1580        elif polarity == "Negative Mode":
1581            filename = bio_standard.replace(" ", "_") + "_" + chromatography + "_Neg.msp"
1582    else:
1583        if polarity == "Positive Mode":
1584            filename = chromatography + "_Pos.msp"
1585        elif polarity == "Negative Mode":
1586            filename = chromatography + "_Neg.msp"
1587
1588    msp_file_path = os.path.join(methods_directory, filename)
1589
1590    with open(msp_file_path, "w") as file:
1591        msp_file.seek(0)
1592        shutil.copyfileobj(msp_file, file)
1593
1594    # Read MSP file
1595    with open(msp_file_path, "r") as msp:
1596
1597        list_of_features = []
1598
1599        # Split MSP into list of compounds
1600        data = msp.read().split("\n\n")
1601        data = [element.split("\n") for element in data]
1602
1603        # Add each line of each compound into a list
1604        for feature in data:
1605            if len(feature) != 1:
1606                list_of_features.append(feature)
1607
1608        features_dict = {}
1609        added_features = []
1610
1611        # Iterate through features in MSP
1612        for feature_index, feature in enumerate(list_of_features):
1613
1614            features_dict[feature_index] = {
1615                "Name": None,
1616                "Precursor m/z": None,
1617                "Retention time": None,
1618                "INCHIKEY": None,
1619                "MS2 spectrum": None
1620            }
1621
1622            # Iterate through each line of each feature in the MSP
1623            for data_index, feature_data in enumerate(feature):
1624
1625                # Capture, name, inchikey, m/z, and RT
1626                if "NAME" in feature_data.upper():
1627                    feature_name = feature_data.split(": ")[-1]
1628                    if feature_name not in added_features:
1629                        added_features.append(feature_name)
1630                        features_dict[feature_index]["Name"] = feature_name
1631                        continue
1632                    else:
1633                        break
1634                elif "PRECURSORMZ" in feature_data.upper():
1635                    features_dict[feature_index]["Precursor m/z"] = feature_data.split(": ")[-1]
1636                    continue
1637                elif "INCHIKEY" in feature_data.upper():
1638                    features_dict[feature_index]["INCHIKEY"] = feature_data.split(": ")[-1]
1639                    continue
1640                elif "RETENTIONTIME" in feature_data.upper():
1641                    features_dict[feature_index]["Retention time"] = feature_data.split(": ")[-1]
1642                    continue
1643
1644                # Capture MS2 spectrum
1645                elif "Num Peaks" in feature_data:
1646
1647                    # Get number of peaks in MS2 spectrum
1648                    num_peaks = int(feature_data.split(": ")[-1])
1649
1650                    # Each line in the MSP corresponds to a peak
1651                    start_index = data_index + 1
1652                    end_index = data_index + num_peaks + 1
1653
1654                    # Each peak is represented as a string e.g. "56.04977\t247187"
1655                    peaks_in_spectrum = []
1656                    for peak in feature[start_index:end_index]:
1657                        peaks_in_spectrum.append(peak.replace("\t", ":"))
1658
1659                    features_dict[feature_index]["MS2 spectrum"] = str(peaks_in_spectrum)
1660                    break
1661
1662    features_dict = { key:value for key, value in features_dict.items() if value["Name"] is not None }
1663
1664    # Adding MSP for biological standards
1665    if bio_standard is not None:
1666
1667        # Get "targeted_features" table
1668        targeted_features_table = sa.Table("targeted_features", db_metadata, autoload=True)
1669
1670        # Prepare DELETE of old targeted features
1671        delete_old_targeted_features = (
1672            sa.delete(targeted_features_table)
1673                .where((targeted_features_table.c.chromatography == chromatography)
1674                       & (targeted_features_table.c.polarity == polarity)
1675                       & (targeted_features_table.c.biological_standard == bio_standard))
1676        )
1677
1678        # Execute DELETE
1679        connection.execute(delete_old_targeted_features)
1680
1681        # Execute INSERT of each targeted feature into targeted_features table
1682        for feature in features_dict:
1683            insert_feature = targeted_features_table.insert().values(
1684                {"name": features_dict[feature]["Name"],
1685                 "chromatography": chromatography,
1686                 "polarity": polarity,
1687                 "biological_standard": bio_standard,
1688                 "precursor_mz": features_dict[feature]["Precursor m/z"],
1689                 "retention_time": features_dict[feature]["Retention time"],
1690                 "ms2_spectrum": features_dict[feature]["MS2 spectrum"],
1691                 "inchikey": features_dict[feature]["INCHIKEY"]})
1692            connection.execute(insert_feature)
1693
1694        # Get "biological_standards" table
1695        biological_standards_table = sa.Table("biological_standards", db_metadata, autoload=True)
1696
1697        # Write location of msp file to respective cell
1698        if polarity == "Positive Mode":
1699            update_msp_file = (
1700                sa.update(biological_standards_table)
1701                    .where((biological_standards_table.c.chromatography == chromatography)
1702                           & (biological_standards_table.c.name == bio_standard))
1703                    .values(num_pos_features=len(features_dict),
1704                            pos_bio_msp_file=filename)
1705            )
1706        elif polarity == "Negative Mode":
1707            update_msp_file = (
1708                sa.update(biological_standards_table)
1709                    .where((biological_standards_table.c.chromatography == chromatography)
1710                           & (biological_standards_table.c.name == bio_standard))
1711                    .values(num_neg_features=len(features_dict),
1712                            neg_bio_msp_file=filename)
1713            )
1714
1715        # Execute UPDATE of MSP file location
1716        connection.execute(update_msp_file)
1717
1718    # Adding MSP for internal standards
1719    else:
1720
1721        # Get internal_standards table
1722        internal_standards_table = sa.Table("internal_standards", db_metadata, autoload=True)
1723
1724        # Prepare DELETE of old internal standards
1725        delete_old_internal_standards = (
1726            sa.delete(internal_standards_table)
1727                .where((internal_standards_table.c.chromatography == chromatography)
1728                       & (internal_standards_table.c.polarity == polarity))
1729        )
1730
1731        # Execute DELETE
1732        connection.execute(delete_old_internal_standards)
1733
1734        # Execute INSERT of each internal standard into internal_standards table
1735        for feature in features_dict:
1736            insert_feature = internal_standards_table.insert().values(
1737                {"name": features_dict[feature]["Name"],
1738                 "chromatography": chromatography,
1739                 "polarity": polarity,
1740                 "precursor_mz": features_dict[feature]["Precursor m/z"],
1741                 "retention_time": features_dict[feature]["Retention time"],
1742                 "ms2_spectrum": features_dict[feature]["MS2 spectrum"],
1743                 "inchikey": features_dict[feature]["INCHIKEY"]})
1744            connection.execute(insert_feature)
1745
1746        # Get "chromatography" table
1747        chromatography_table = sa.Table("chromatography_methods", db_metadata, autoload=True)
1748
1749        # Write location of msp file to respective cell
1750        if polarity == "Positive Mode":
1751            update_msp_file = (
1752                sa.update(chromatography_table)
1753                    .where(chromatography_table.c.method_id == chromatography)
1754                    .values(num_pos_standards=len(features_dict),
1755                            pos_istd_msp_file=filename)
1756            )
1757        elif polarity == "Negative Mode":
1758            update_msp_file = (
1759                sa.update(chromatography_table)
1760                    .where(chromatography_table.c.method_id == chromatography)
1761                    .values(num_neg_standards=len(features_dict),
1762                            neg_istd_msp_file=filename)
1763            )
1764
1765        # Execute UPDATE of MSP file location
1766        connection.execute(update_msp_file)
1767
1768    # If the corresponding TXT library existed, delete it
1769    txt_library = os.path.join(methods_directory, filename.replace(".msp", ".txt"))
1770    os.remove(txt_library) if os.path.exists(txt_library) else None
1771
1772    # Close the connection
1773    connection.close()
1774
1775
def add_csv_to_database(csv_file, chromatography, polarity):

    """
    Parses compounds from a CSV file into the Settings database.

    Writes the CSV contents to a tab-delimited TXT library in the methods directory, replaces the
    internal standards stored for the given chromatography / polarity in the "internal_standards"
    table, and stores the TXT filename and the number of standards in the "chromatography_methods" table.
    If an MSP library with the same base name exists in the methods directory, it is deleted.

    The CSV is read for the columns "Common Name", "MS1 m/z", and "RT (min)".

    TODO: The MSP/TXT libraries have standardized names; there is no need to store the filename in the database.

    Args:
        csv_file (io.StringIO):
            In-memory text-stream file object
        chromatography (str):
            Chromatography method ID (name)
        polarity (str):
            Polarity for which the library should be used for ("Positive Mode" or "Negative Mode")

    Returns:
        None

    Raises:
        ValueError: If polarity is neither "Positive Mode" nor "Negative Mode".
    """

    # Filename suffix and "chromatography_methods" columns for each polarity
    polarity_settings = {
        "Positive Mode": ("_Pos.txt", "num_pos_standards", "pos_istd_msp_file"),
        "Negative Mode": ("_Neg.txt", "num_neg_standards", "neg_istd_msp_file"),
    }

    # Fail early with a clear message instead of an UnboundLocalError further down
    if polarity not in polarity_settings:
        raise ValueError('Polarity must be "Positive Mode" or "Negative Mode", got: ' + repr(polarity))

    suffix, count_column, file_column = polarity_settings[polarity]
    filename = chromatography + suffix

    # Convert CSV file into Python dictionary
    df_internal_standards = pd.read_csv(csv_file, index_col=False)
    internal_standards_dict = df_internal_standards.to_dict("index")

    # Create methods directory if it doesn't already exist
    os.makedirs(methods_directory, exist_ok=True)

    # Write CSV columns to tab-delimited text file
    txt_file_path = os.path.join(methods_directory, filename)
    df_internal_standards.to_csv(txt_file_path, sep="\t", index=False)

    # Connect to database
    db_metadata, connection = connect_to_database("Settings")

    try:
        # Get internal_standards table
        internal_standards_table = sa.Table("internal_standards", db_metadata, autoload=True)

        # DELETE old internal standards for this chromatography / polarity
        delete_old_internal_standards = (
            sa.delete(internal_standards_table)
                .where((internal_standards_table.c.chromatography == chromatography)
                       & (internal_standards_table.c.polarity == polarity))
        )
        connection.execute(delete_old_internal_standards)

        # INSERT each internal standard into internal_standards table
        for standard in internal_standards_dict.values():
            insert_standard = internal_standards_table.insert().values(
                {"name": standard["Common Name"],
                 "chromatography": chromatography,
                 "polarity": polarity,
                 "precursor_mz": standard["MS1 m/z"],
                 "retention_time": standard["RT (min)"]})
            connection.execute(insert_standard)

        # Get "chromatography_methods" table
        chromatography_table = sa.Table("chromatography_methods", db_metadata, autoload=True)

        # Write number of standards and library filename to the columns for this polarity
        update_library_file = (
            sa.update(chromatography_table)
                .where(chromatography_table.c.method_id == chromatography)
                .values({count_column: len(internal_standards_dict),
                         file_column: filename})
        )
        connection.execute(update_library_file)

        # If the corresponding MSP library existed, delete it
        msp_library = os.path.join(methods_directory, filename.replace(".txt", ".msp"))
        if os.path.exists(msp_library):
            os.remove(msp_library)

    finally:
        # Close the connection even if one of the statements above failed
        connection.close()
1871
1872
def get_msdial_configurations():

    """
    Returns the names of all MS-DIAL configurations stored in the Settings database.

    Returns:
        List of "config_name" values (as strings) from the "msdial_parameters" table.
    """

    db_engine = sa.create_engine(settings_database)
    config_names = pd.read_sql("SELECT * FROM msdial_parameters", db_engine)["config_name"]

    # Cast to str so callers always get plain strings
    names_as_text = config_names.astype(str)
    return names_as_text.tolist()
1882
1883
def generate_msdial_parameters_file(chromatography, polarity, msp_file_path, bio_standard=None):

    """
    Uses parameters from user-curated MS-DIAL configuration to create a parameters.txt file for MS-DIAL.

    The MS-DIAL configuration is looked up from the biological standard (if given) or from the
    chromatography method. The parameters file is written to the methods directory, and its path is
    stored in the "pos_parameter_file" / "neg_parameter_file" column of the "biological_standards"
    or "chromatography_methods" table, respectively.

    TODO: Currently, this function is only called upon a new job setup. To allow changes during a QC job,
        this function should be called every time the user makes a configuration save in Settings > MS-DIAL Configurations.

    Args:
        chromatography (str):
            Chromatography method ID (name)
        polarity (str):
            Polarity ("Positive" or "Negative")
        msp_file_path (str):
            Library file path, ending in ".msp" or ".txt"
        bio_standard (str, default None):
            Specifies that the parameters file is for a biological standard

    Returns:
        None

    Raises:
        ValueError: If polarity is not "Positive" / "Negative", or the library is neither ".msp" nor ".txt".
    """

    # Filename suffix and adduct for each polarity
    polarity_settings = {
        "Positive": ("Pos", "[M+H]+"),
        "Negative": ("Neg", "[M-H]-"),
    }

    # Fail early with a clear message instead of an UnboundLocalError further down
    if polarity not in polarity_settings:
        raise ValueError('Polarity must be "Positive" or "Negative", got: ' + repr(polarity))

    polarity_suffix, adduct_type = polarity_settings[polarity]

    # The library line in the parameters file depends on the library file type
    if msp_file_path.endswith(".msp"):
        library_line = "MSP file: " + msp_file_path
    elif msp_file_path.endswith(".txt"):
        library_line = "Text file: " + msp_file_path
    else:
        raise ValueError("Library file must be an .msp or .txt file, got: " + repr(msp_file_path))

    # Get name of selected configuration; the same owner is used as the filename prefix
    if bio_standard is not None:
        df_bio_standards = get_biological_standards()
        df_bio_standards = df_bio_standards.loc[
            (df_bio_standards["chromatography"] == chromatography) & (df_bio_standards["name"] == bio_standard)]
        config_name = df_bio_standards["msdial_config_id"].astype(str).values[0]
        file_prefix = bio_standard
    else:
        df_methods = get_chromatography_methods()
        df_methods = df_methods.loc[df_methods["method_id"] == chromatography]
        config_name = df_methods["msdial_config_id"].astype(str).values[0]
        file_prefix = chromatography

    # Tuple of parameter values, indexed by position below
    parameters = get_msdial_configuration_parameters(config_name)

    # Create "methods" directory if it does not exist
    os.makedirs(methods_directory, exist_ok=True)

    # Name parameters file accordingly
    filename = (file_prefix.replace(" ", "_") + "_" + config_name.replace(" ", "_")
                + "_Parameters_" + polarity_suffix + ".txt")
    parameters_file = os.path.join(methods_directory, filename)

    # Text file contents, one list per section; sections are separated by a blank line
    sections = [
        ["#Data type",
         "MS1 data type: Centroid",
         "MS2 data type: Centroid",
         "Ion mode: " + polarity,
         "DIA file:"],

        ["#Data collection parameters",
         "Retention time begin: " + str(parameters[0]),
         "Retention time end: " + str(parameters[1]),
         "Mass range begin: " + str(parameters[2]),
         "Mass range end: " + str(parameters[3])],

        ["#Centroid parameters",
         "MS1 tolerance for centroid: " + str(parameters[4]),
         "MS2 tolerance for centroid: " + str(parameters[5])],

        ["#Peak detection parameters",
         "Smoothing method: " + str(parameters[6]),
         "Smoothing level: " + str(parameters[7]),
         "Minimum peak width: " + str(parameters[8]),
         "Minimum peak height: " + str(parameters[9]),
         "Mass slice width: " + str(parameters[10])],

        ["#Deconvolution parameters",
         "Sigma window value: 0.5",
         "Amplitude cut off: 0"],

        ["#Adduct list",
         "Adduct list: " + adduct_type],

        ["#Text file and post identification (retention time and accurate mass based) setting",
         library_line,
         "Retention time tolerance for post identification: " + str(parameters[11]),
         "Accurate ms1 tolerance for post identification: " + str(parameters[12]),
         "Post identification score cut off: " + str(parameters[13])],

        ["#Alignment parameters setting",
         "Retention time tolerance for alignment: " + str(parameters[14]),
         "MS1 tolerance for alignment: " + str(parameters[15]),
         "Retention time factor for alignment: " + str(parameters[16]),
         "MS1 factor for alignment: " + str(parameters[17]),
         "Peak count filter: " + str(parameters[18]),
         "QC at least filter: " + str(parameters[19])],
    ]

    # Write parameters to a text file: every line is newline-terminated, one blank line between sections
    with open(parameters_file, "w") as file:
        file.write("\n\n".join("\n".join(section) for section in sections) + "\n")

    # Write path of parameters text file to the biological standard or chromatography method in database
    db_metadata, connection = connect_to_database("Settings")

    try:
        if bio_standard is not None:
            # For processing biological standard samples
            target_table = sa.Table("biological_standards", db_metadata, autoload=True)
            row_filter = ((target_table.c.chromatography == chromatography)
                          & (target_table.c.name == bio_standard))
        else:
            # For processing samples with internal standards
            target_table = sa.Table("chromatography_methods", db_metadata, autoload=True)
            row_filter = target_table.c.method_id == chromatography

        # Column is "pos_parameter_file" or "neg_parameter_file"
        parameter_file_column = polarity_suffix.lower() + "_parameter_file"

        update_parameter_file = (
            sa.update(target_table)
                .where(row_filter)
                .values({parameter_file_column: parameters_file})
        )
        connection.execute(update_parameter_file)

    finally:
        # Close the connection even if the update failed
        connection.close()
2040
2041
def add_msdial_configuration(msdial_config_name):

    """
    Creates a new MS-DIAL configuration with default parameter values.

    A single row is inserted into the "msdial_parameters" table of the Settings database,
    using the given name and the defaults listed below.

    Args:
        msdial_config_name (str): MS-DIAL configuration ID

    Returns:
        None
    """

    # Default values for every parameter of a freshly created configuration
    default_parameters = {
        "rt_begin": 0,
        "rt_end": 100,
        "mz_begin": 0,
        "mz_end": 2000,
        "ms1_centroid_tolerance": 0.008,
        "ms2_centroid_tolerance": 0.01,
        "smoothing_method": "LinearWeightedMovingAverage",
        "smoothing_level": 3,
        "min_peak_width": 3,
        "min_peak_height": 35000,
        "mass_slice_width": 0.1,
        "post_id_rt_tolerance": 0.3,
        "post_id_mz_tolerance": 0.008,
        "post_id_score_cutoff": 85,
        "alignment_rt_tolerance": 0.05,
        "alignment_mz_tolerance": 0.008,
        "alignment_rt_factor": 0.5,
        "alignment_mz_factor": 0.5,
        "peak_count_filter": 0,
        "qc_at_least_filter": "True",
    }

    # Open the Settings database and load the parameters table
    metadata, conn = connect_to_database("Settings")
    parameters_table = sa.Table("msdial_parameters", metadata, autoload=True)

    # Build the row: configuration name plus defaults
    new_row = {"config_name": msdial_config_name, **default_parameters}
    statement = sa.insert(parameters_table).values(new_row)

    # Run the INSERT and release the connection
    conn.execute(statement)
    conn.close()
2088
2089
def remove_msdial_configuration(msdial_config_name):

    """
    Removes an MS-DIAL configuration from the Settings database.

    All rows of the "msdial_parameters" table whose "config_name" equals the given name are deleted.

    Args:
        msdial_config_name (str): MS-DIAL configuration ID

    Returns:
        None
    """

    # Open the Settings database and load the parameters table
    metadata, conn = connect_to_database("Settings")
    parameters_table = sa.Table("msdial_parameters", metadata, autoload=True)

    # DELETE the rows matching the configuration name
    matches_config = parameters_table.c.config_name == msdial_config_name
    statement = parameters_table.delete().where(matches_config)

    # Run the DELETE and release the connection
    conn.execute(statement)
    conn.close()
2117
2118
def get_msdial_configuration_parameters(msdial_config_name, parameter=None):

    """
    Returns tuple of parameters defined for a selected MS-DIAL configuration.

    The tuple holds the values of the first matching row of the "msdial_parameters" table,
    in table column order, with the "id" and "config_name" columns removed.

    TODO: The MS-DIAL configuration is returned as a tuple for a concise implementation of get_msdial_parameters_for_config()
        in the DashWebApp module. While convenient there, this function is not optimal for maintainability. Should return
        the entire DataFrame record instead.

    See update_msdial_configuration() for details on parameters.

    Args:
        msdial_config_name (str):
            MS-DIAL configuration ID
        parameter (str, default None):
            If specified, returns only the value for the given parameter (column name)

    Returns:
        Tuple of parameters for the given MS-DIAL configuration, or single parameter value.

    Raises:
        IndexError: If no configuration with the given name exists.
        KeyError: If the requested parameter is not a column of the table.
    """

    # Get "msdial_parameters" table from database as a DataFrame
    engine = sa.create_engine(settings_database)
    df_configurations = pd.read_sql("SELECT * FROM msdial_parameters", engine)

    # Get selected configuration without its identifier columns.
    # drop() returns a new DataFrame; dropping in place on a .loc slice triggers SettingWithCopyWarning.
    selected_config = df_configurations.loc[
        df_configurations["config_name"] == msdial_config_name
    ].drop(columns=["id", "config_name"])

    if parameter is not None:
        return selected_config[parameter].values[0]

    return tuple(selected_config.to_records(index=False)[0])
2154
2155
def update_msdial_configuration(config_name, rt_begin, rt_end, mz_begin, mz_end, ms1_centroid_tolerance,
    ms2_centroid_tolerance, smoothing_method, smoothing_level, mass_slice_width, min_peak_width, min_peak_height,
    post_id_rt_tolerance, post_id_mz_tolerance, post_id_score_cutoff, alignment_rt_tolerance, alignment_mz_tolerance,
    alignment_rt_factor, alignment_mz_factor, peak_count_filter, qc_at_least_filter):

    """
    Saves new values for every parameter of an existing MS-DIAL configuration.

    The row(s) of the "msdial_parameters" table whose "config_name" matches are updated in the Settings database.

    For details on MS-DIAL parameters, see: https://mtbinfo-team.github.io/mtbinfo.github.io/MS-DIAL/tutorial#section-2-3

    Args:
        config_name (str):
            Name / ID of MS-DIAL configuration
        rt_begin (int):
            Lower bound of the retention time range for analysis
        rt_end (int):
            Upper bound of the retention time range for analysis
        mz_begin (float):
            Lower bound of the precursor m/z range for analysis
        mz_end (float):
            Upper bound of the precursor m/z range for analysis
        ms1_centroid_tolerance (float):
            MS1 centroid tolerance
        ms2_centroid_tolerance (float):
            MS2 centroid tolerance
        smoothing_method (str):
            Peak smoothing method for peak detection
        smoothing_level (int):
            Peak smoothing level
        mass_slice_width (float):
            Mass slice width
        min_peak_width (int):
            Minimum peak width threshold
        min_peak_height (int):
            Minimum peak height threshold
        post_id_rt_tolerance (float):
            Post-identification retention time tolerance
        post_id_mz_tolerance (float):
            Post-identification precursor m/z tolerance
        post_id_score_cutoff (int):
            Similarity score cutoff after peak identification
        alignment_rt_tolerance (float):
            Post-alignment retention time tolerance
        alignment_mz_tolerance (float):
            Post-alignment precursor m/z tolerance
        alignment_rt_factor (float):
            Post-alignment retention time factor
        alignment_mz_factor (float):
            Post-alignment precursor m/z factor
        peak_count_filter (int):
            Peak count filter
        qc_at_least_filter (str):
            QC at least filter

    Returns:
        None
    """

    # Column name -> new value for the configuration row
    new_values = {
        "rt_begin": rt_begin,
        "rt_end": rt_end,
        "mz_begin": mz_begin,
        "mz_end": mz_end,
        "ms1_centroid_tolerance": ms1_centroid_tolerance,
        "ms2_centroid_tolerance": ms2_centroid_tolerance,
        "smoothing_method": smoothing_method,
        "smoothing_level": smoothing_level,
        "min_peak_width": min_peak_width,
        "min_peak_height": min_peak_height,
        "mass_slice_width": mass_slice_width,
        "post_id_rt_tolerance": post_id_rt_tolerance,
        "post_id_mz_tolerance": post_id_mz_tolerance,
        "post_id_score_cutoff": post_id_score_cutoff,
        "alignment_rt_tolerance": alignment_rt_tolerance,
        "alignment_mz_tolerance": alignment_mz_tolerance,
        "alignment_rt_factor": alignment_rt_factor,
        "alignment_mz_factor": alignment_mz_factor,
        "peak_count_filter": peak_count_filter,
        "qc_at_least_filter": qc_at_least_filter,
    }

    # Open the Settings database and load the parameters table
    metadata, conn = connect_to_database("Settings")
    parameters_table = sa.Table("msdial_parameters", metadata, autoload=True)

    # UPDATE the row of the selected configuration
    statement = (
        parameters_table.update()
            .where(parameters_table.c.config_name == config_name)
            .values(new_values)
    )

    # Run the UPDATE and release the connection
    conn.execute(statement)
    conn.close()
2249
2250
def get_msp_file_path(chromatography, polarity, bio_standard=None):

    """
    Returns file path of the MSP / TXT library for a selected chromatography / polarity (stored
    in the methods folder upon user upload) for MS-DIAL parameter file generation.

    The library filename is read from the "biological_standards" table if a biological standard
    is given, otherwise from the "chromatography_methods" table, and joined onto the methods directory.

    TODO: Once added to workspace, MSP / TXT library file names are standardized. No need to store / retrieve from database.
        Get the file path using the filename e.g. return directory + chromatography + "_" + polarity + ".msp".

    Args:
        chromatography (str):
            Chromatography method ID
        polarity (str):
            Polarity, either "Positive" or "Negative"
        bio_standard (str, default None):
            Name of biological standard

    Returns:
        MSP / TXT library file path.

    Raises:
        ValueError: If polarity is neither "Positive" nor "Negative".
        IndexError: If no matching row exists in the database.
    """

    # Column prefix for each polarity; fail early instead of hitting an UnboundLocalError later
    column_prefixes = {"Positive": "pos", "Negative": "neg"}
    if polarity not in column_prefixes:
        raise ValueError('Polarity must be "Positive" or "Negative", got: ' + repr(polarity))

    # Connect to database
    engine = sa.create_engine(settings_database)

    # Bound parameters keep names containing quotes from breaking (or injecting into) the query
    if bio_standard is not None:
        # Get selected biological standard
        query = sa.text("SELECT * FROM biological_standards "
                        "WHERE name = :name AND chromatography = :chromatography")
        query_parameters = {"name": bio_standard, "chromatography": chromatography}
        library_column = column_prefixes[polarity] + "_bio_msp_file"
    else:
        # Get selected chromatography method
        query = sa.text("SELECT * FROM chromatography_methods WHERE method_id = :method_id")
        query_parameters = {"method_id": chromatography}
        library_column = column_prefixes[polarity] + "_istd_msp_file"

    df_library = pd.read_sql(query, engine, params=query_parameters)

    # Get library filename in requested polarity
    library_filename = df_library[library_column].astype(str).values[0]

    # Return file path
    return os.path.join(methods_directory, library_filename)
2301
2302
def get_parameter_file_path(chromatography, polarity, biological_standard=None):

    """
    Returns file path of parameters file stored in database.

    The path is read from the "biological_standards" table if a biological standard is given,
    otherwise from the "chromatography_methods" table.

    TODO: Once generated, MS-DIAL parameter filenames are standardized. No need to store / retrieve from database.
        Get the file path using the filename e.g. return directory + chromatography + "_" + polarity + "_Parameters.txt".

    Args:
        chromatography (str):
            Chromatography method ID
        polarity (str):
            Polarity, either "Pos" or "Neg"
        biological_standard (str, default None):
            Name of biological standard

    Returns:
        File path for MS-DIAL parameters.txt file.

    Raises:
        ValueError: If polarity is neither "Pos" nor "Neg".
        IndexError: If no matching row exists in the database.
    """

    # Column holding the parameters file for each polarity; fail early on anything else
    parameter_file_columns = {"Pos": "pos_parameter_file", "Neg": "neg_parameter_file"}
    if polarity not in parameter_file_columns:
        raise ValueError('Polarity must be "Pos" or "Neg", got: ' + repr(polarity))

    engine = sa.create_engine(settings_database)

    # Bound parameters keep names containing quotes from breaking (or injecting into) the query
    if biological_standard is not None:
        query = sa.text("SELECT * FROM biological_standards "
                        "WHERE chromatography = :chromatography AND name = :name")
        query_parameters = {"chromatography": chromatography, "name": biological_standard}
    else:
        query = sa.text("SELECT * FROM chromatography_methods WHERE method_id = :method_id")
        query_parameters = {"method_id": chromatography}

    df = pd.read_sql(query, engine, params=query_parameters)

    return df[parameter_file_columns[polarity]].astype(str).values[0]
2339
2340
def get_msdial_directory():

    """
    Returns the MS-DIAL directory stored in the Settings database.

    Returns:
        The "msdial_directory" value (as a string) of the first row of the "workspace" table.
    """

    df_workspace = get_table("Settings", "workspace")
    directory_column = df_workspace["msdial_directory"].astype(str)
    return directory_column.values[0]
2348
2349
def get_msconvert_directory():

    """
    Returns location of the MSConvert (ProteoWizard) directory.

    The user name is taken from the third path component of the MS-DIAL directory, and
    C:/Users/<username>/AppData/Local/Apps/ is then scanned for a folder whose name contains "ProteoWizard".

    TODO: There is probably a better way to implement this.

    Returns:
        Path of the first matching folder in C:/Users/<username>/AppData/Local/Apps/.
    """

    # Normalize separators so the path can be split regardless of slash style
    path_parts = get_msdial_directory().replace("\\", "/").split("/")
    user = path_parts[2]

    apps_directory = "C:/Users/" + user + "/AppData/Local/Apps/"

    # Collect every sub-folder that looks like a ProteoWizard installation
    proteowizard_folders = []
    for entry in os.scandir(apps_directory):
        if entry.is_dir() and "ProteoWizard" in entry.name:
            proteowizard_folders.append(entry.path)

    return proteowizard_folders[0]
2367
2368
def update_msdial_directory(msdial_directory):

    """
    Stores a new MS-DIAL directory location in the Settings database.

    The "msdial_directory" column of the "workspace" table is updated for the row with id 1.

    Args:
        msdial_directory (str): New MS-DIAL directory location

    Returns:
        None
    """

    # Open the Settings database and load the workspace table
    metadata, conn = connect_to_database("Settings")
    workspace = sa.Table("workspace", metadata, autoload=True)

    # UPDATE the workspace row with id 1
    statement = (
        workspace.update()
            .where(workspace.c.id == 1)
            .values({"msdial_directory": msdial_directory})
    )

    # Run the UPDATE and release the connection
    conn.execute(statement)
    conn.close()
2392
2393
def get_internal_standards_dict(chromatography, value_type):

    """
    Returns dictionary of internal standard keys mapped to either m/z or RT values.

    This function is used to establish a y-axis range for internal standard retention time plots.
    See load_istd_rt_plot() in the PlotGeneration module.

    TODO: This function needs to filter for polarity!

    Args:
        chromatography (str):
            Chromatography method to retrieve internal standards for
        value_type (str):
            Column of the "internal_standards" table to use as values; it is cast to float
            (e.g. "precursor_mz" or "retention_time")

    Returns:
        Dictionary with key-value pairs of { internal_standard: value_type }. If a name occurs
        more than once (e.g. in both polarities), the last row wins.
    """

    engine = sa.create_engine(settings_database)

    # Bound parameter keeps method names containing quotes from breaking (or injecting into) the query
    query = sa.text("SELECT * FROM internal_standards WHERE chromatography = :chromatography")
    df_internal_standards = pd.read_sql(query, engine, params={"chromatography": chromatography})

    names = df_internal_standards["name"].astype(str).tolist()
    values = df_internal_standards[value_type].astype(float).tolist()

    return dict(zip(names, values))
2426
2427
def get_internal_standards(chromatography, polarity):

    """
    Returns DataFrame of internal standards for a given chromatography method and polarity.

    Args:
        chromatography (str):
            Chromatography method ID
        polarity (str):
            Polarity (either "Pos" or "Neg"); any other value is matched against the
            "polarity" column as-is

    Returns:
        DataFrame of "internal_standards" table from Settings database, filtered by chromatography and polarity.
    """

    # Translate short polarity codes into the values stored in the database
    polarity = {"Pos": "Positive Mode", "Neg": "Negative Mode"}.get(polarity, polarity)

    engine = sa.create_engine(settings_database)

    # Bound parameters keep names containing quotes from breaking (or injecting into) the query
    query = sa.text("SELECT * FROM internal_standards "
                    "WHERE chromatography = :chromatography AND polarity = :polarity")

    return pd.read_sql(query, engine, params={"chromatography": chromatography, "polarity": polarity})
2454
2455
def get_targeted_features(biological_standard, chromatography, polarity):

    """
    Returns DataFrame of metabolite targets for a given biological standard, chromatography, and polarity.

    Args:
        biological_standard (str):
            Name of biological standard
        chromatography (str):
            Chromatography method ID (name)
        polarity (str):
            Polarity (either "Pos" or "Neg"); any other value is matched against the
            "polarity" column as-is

    Returns:
        DataFrame of "targeted_features" table from Settings database, filtered by
        biological standard, chromatography and polarity.
    """

    # Translate short polarity codes into the values stored in the database
    polarity = {"Pos": "Positive Mode", "Neg": "Negative Mode"}.get(polarity, polarity)

    engine = sa.create_engine(settings_database)

    # Bound parameters keep names containing quotes from breaking (or injecting into) the query
    query = sa.text("SELECT * FROM targeted_features "
                    "WHERE chromatography = :chromatography "
                    "AND polarity = :polarity "
                    "AND biological_standard = :biological_standard")

    query_parameters = {
        "chromatography": chromatography,
        "polarity": polarity,
        "biological_standard": biological_standard,
    }

    return pd.read_sql(query, engine, params=query_parameters)
2486
2487
def get_biological_standards():

    """
    Loads the complete "biological_standards" table of the Settings database.

    Returns:
        DataFrame holding every row and column of the "biological_standards" table.
    """

    settings_engine = sa.create_engine(settings_database)
    return pd.read_sql("SELECT * FROM biological_standards", settings_engine)
2498
2499
def get_biological_standards_list():

    """
    Lists the distinct biological standard names stored in the Settings database.

    Returns:
        list: Unique values of the "name" column of the "biological_standards" table, as strings.
    """

    name_column = get_biological_standards()["name"]
    distinct_names = name_column.astype(str).unique()
    return distinct_names.tolist()
2508
2509
def add_biological_standard(name, identifier):

    """
    Creates new biological standard with name and identifier.

    The biological standard identifier is a text substring used to distinguish between sample and biological standard.
    MS-AutoQC checks filenames in the sequence for this identifier to process samples accordingly.

    One row is inserted per existing chromatography method, each starting with zero features
    and the "Default" MS-DIAL configuration.

    Args:
        name (str):
            Name of biological standard
        identifier (str):
            String identifier in filename for biological standard

    Returns:
        None
    """

    # Get list of chromatography methods
    chromatography_methods = get_chromatography_methods()["method_id"].tolist()

    # Connect to database
    db_metadata, connection = connect_to_database("Settings")

    try:
        biological_standards_table = sa.Table("biological_standards", db_metadata, autoload=True)

        # Insert a biological standard row for each chromatography
        for method in chromatography_methods:
            insert = biological_standards_table.insert().values({
                "name": name,
                "identifier": identifier,
                "chromatography": method,
                "num_pos_features": 0,
                "num_neg_features": 0,
                "msdial_config_id": "Default"
            })
            connection.execute(insert)

    finally:
        # Always release the connection, even if reflection or an insert fails
        connection.close()
2549
2550
def remove_biological_standard(name):

    """
    Deletes biological standard and corresponding MSPs from Settings database.

    The MSP files recorded for the standard ("pos_bio_msp_file" / "neg_bio_msp_file" columns) are removed
    from the methods directory, then the standard's rows are deleted from the "biological_standards"
    and "targeted_features" tables.

    Args:
        name (str): Name of the biological standard

    Returns:
        None
    """

    # Collect the MSP files registered for this biological standard
    df = get_table("Settings", "biological_standards")
    df = df.loc[df["name"] == name]
    recorded_files = df["pos_bio_msp_file"].astype(str).tolist() + df["neg_bio_msp_file"].astype(str).tolist()

    # Compare on the bare file name so a stored value matches a directory entry whether it holds
    # a file name or a path; "None" / "nan" are the string forms of missing values
    files_to_delete = {
        os.path.basename(file) for file in recorded_files if file not in ("None", "nan", "")
    }

    # Delete only the files that belong to this standard (compare each directory entry, not the standard name)
    for file in os.listdir(methods_directory):
        if file in files_to_delete:
            os.remove(os.path.join(methods_directory, file))

    # Connect to database
    db_metadata, connection = connect_to_database("Settings")

    try:
        biological_standards_table = sa.Table("biological_standards", db_metadata, autoload=True)
        targeted_features_table = sa.Table("targeted_features", db_metadata, autoload=True)

        # Remove biological standard
        delete_biological_standard = (
            sa.delete(biological_standards_table)
                .where((biological_standards_table.c.name == name))
        )
        connection.execute(delete_biological_standard)

        # Remove targeted features for that biological standard
        delete_targeted_features = (
            sa.delete(targeted_features_table)
                .where((targeted_features_table.c.biological_standard == name))
        )
        connection.execute(delete_targeted_features)

    finally:
        # Always release the connection, even if a statement fails
        connection.close()
2593
2594
def update_msdial_config_for_bio_standard(biological_standard, chromatography, config_id):

    """
    Updates MS-DIAL configuration for given biological standard and chromatography method combination.

    Args:
        biological_standard (str):
            Name of the biological standard
        chromatography (str):
            Chromatography method
        config_id (str):
            Name of MS-DIAL configuration to set for this biological standard - chromatography combination

    Returns:
        None
    """

    # Connect to database
    db_metadata, connection = connect_to_database("Settings")

    try:
        biological_standards_table = sa.Table("biological_standards", db_metadata, autoload=True)

        # Update MS-DIAL configuration for the matching (name, chromatography) row(s)
        update_msdial_config = (
            sa.update(biological_standards_table)
                .where((biological_standards_table.c.name == biological_standard)
                       & (biological_standards_table.c.chromatography == chromatography))
                .values(msdial_config_id=config_id)
        )
        connection.execute(update_msdial_config)

    finally:
        # Always release the connection, even if the update fails
        connection.close()
2626
2627
def get_biological_standard_identifiers(bio_standards=None):

    """
    Returns dictionary of identifiers for a given list of biological standards.

    If no list is provided, returns dict of identifiers for all biological standards.
    An empty list yields an empty dict, and requested names that are not present in the
    "biological_standards" table are skipped.

    Args:
        bio_standards (list, default None): List of biological standards

    Returns:
        Dictionary with key-value pairs of { identifier: biological_standard }
    """

    df_bio_standards = get_biological_standards()

    # Keep name and identifier paired row by row, one pair per standard (its first row).
    # De-duplicating the two columns independently would misalign them whenever two
    # standards share an identifier.
    df_pairs = df_bio_standards[["name", "identifier"]].astype(str).drop_duplicates(subset=["name"])
    identifier_by_name = dict(zip(df_pairs["name"], df_pairs["identifier"]))

    if bio_standards is None:
        requested = list(identifier_by_name)
    else:
        requested = bio_standards

    return {
        identifier_by_name[bio_standard]: bio_standard
        for bio_standard in requested
        if bio_standard in identifier_by_name
    }
2659
2660
def get_qc_configurations():

    """
    Loads the complete "qc_parameters" table of the Settings database.

    Returns:
        DataFrame holding every row and column of the "qc_parameters" table.
    """

    settings_engine = sa.create_engine(settings_database)
    df_qc_parameters = pd.read_sql("SELECT * FROM qc_parameters", settings_engine)
    return df_qc_parameters
2669
2670
def get_qc_configurations_list():

    """
    Lists the names of all QC configurations stored in the Settings database.

    Returns:
        list: Values of the "config_name" column of the "qc_parameters" table, as strings.
    """

    config_names = get_qc_configurations()["config_name"]
    return config_names.astype(str).tolist()
2678
2679
def add_qc_configuration(qc_config_name):

    """
    Adds a new QC configuration to the "qc_parameters" table in the Settings database.

    The new configuration is created with the default cutoffs written below and with all
    four QC checks enabled.

    Args:
        qc_config_name (str): Name of the QC configuration

    Returns:
        None
    """

    # Connect to database
    db_metadata, connection = connect_to_database("Settings")

    try:
        # Get QC parameters table
        qc_parameters_table = sa.Table("qc_parameters", db_metadata, autoload=True)

        # Prepare insert of the new configuration with default parameters
        insert_config = qc_parameters_table.insert().values(
            {"config_name": qc_config_name,
             "intensity_dropouts_cutoff": 4,
             "library_rt_shift_cutoff": 0.1,
             "in_run_rt_shift_cutoff": 0.05,
             "library_mz_shift_cutoff": 0.005,
             "intensity_enabled": True,
             "library_rt_enabled": True,
             "in_run_rt_enabled": True,
             "library_mz_enabled": True}
        )

        # Execute INSERT to database
        connection.execute(insert_config)

    finally:
        # Always release the connection, even if the insert fails
        connection.close()
2714
2715
def remove_qc_configuration(qc_config_name):

    """
    Deletes QC configuration from the "qc_parameters" table in the Settings database.

    Args:
        qc_config_name (str): Name of the QC configuration

    Returns:
        None
    """

    # Connect to database
    db_metadata, connection = connect_to_database("Settings")

    try:
        # Get QC parameters table
        qc_parameters_table = sa.Table("qc_parameters", db_metadata, autoload=True)

        # Prepare DELETE of the QC configuration
        delete_config = (
            sa.delete(qc_parameters_table)
                .where(qc_parameters_table.c.config_name == qc_config_name)
        )

        # Execute DELETE
        connection.execute(delete_config)

    finally:
        # Always release the connection, even if the delete fails
        connection.close()
2743
2744
def get_qc_configuration_parameters(config_name=None, instrument_id=None, run_id=None):

    """
    Returns DataFrame of parameters for a selected QC configuration.

    The DataFrame has columns for each parameter, as well as for whether the parameter is enabled.
    The configuration is selected either directly by name, or indirectly through the "qc_config_id"
    recorded for an instrument run. If config_name is given, it takes precedence.

    Args:
        config_name (str, default None):
            Name of QC configuration
        instrument_id (str, default None):
            Instrument ID (name)
        run_id (str, default None):
            Instrument run ID (job ID)

    Returns:
        DataFrame of parameters for QC configuration, without the "id" and "config_name" columns
        and with the "*_enabled" flags expressed as booleans.

    Raises:
        ValueError: If neither config_name nor both instrument_id and run_id are provided.
    """

    df_configurations = get_table("Settings", "qc_parameters")

    # Resolve the configuration name from the run when it was not passed explicitly
    if config_name is None:
        if instrument_id is None or run_id is None:
            raise ValueError("Provide either config_name, or both instrument_id and run_id")
        df_runs = get_table(instrument_id, "runs")
        config_name = df_runs.loc[df_runs["run_id"] == run_id]["qc_config_id"].values[0]

    # Work on an independent copy so the edits below never target a slice of the full table
    selected_config = (
        df_configurations.loc[df_configurations["config_name"] == config_name]
            .copy()
            .drop(columns=["id", "config_name"])
    )

    # Flags are stored as integers (0 / 1); convert them back to booleans
    for column in ["intensity_enabled", "library_rt_enabled", "in_run_rt_enabled", "library_mz_enabled"]:
        # Use an object column so booleans can be written without an implicit dtype upcast
        selected_config[column] = selected_config[column].astype(object)
        selected_config.loc[selected_config[column] == 1, column] = True
        selected_config.loc[selected_config[column] == 0, column] = False

    # Return parameters of selected configuration as a DataFrame
    return selected_config
2785
2786
def update_qc_configuration(config_name, intensity_dropouts_cutoff, library_rt_shift_cutoff, in_run_rt_shift_cutoff,
    library_mz_shift_cutoff, intensity_enabled, library_rt_enabled, in_run_rt_enabled, library_mz_enabled):

    """
    Updates parameters for the given QC configuration.

    Due to the database schema, booleans are stored as integers: 0 for False and 1 for True. They need to be
    cast back to booleans in get_qc_configuration_parameters(). A schema change would remove the bloat.

    Args:
        config_name (str):
            Name of QC configuration
        intensity_dropouts_cutoff (int):
            Minimum number of internal standard intensity dropouts to constitute a QC fail
        library_rt_shift_cutoff (float):
            Maximum shift from library RT values to constitute a QC fail
        in_run_rt_shift_cutoff (float):
            Maximum shift from in-run RT values to constitute a QC fail
        library_mz_shift_cutoff (float):
            Maximum shift from library m/z values to constitute a QC fail
        intensity_enabled (bool):
            Enables / disables QC check for intensity dropout cutoffs
        library_rt_enabled (bool):
            Enables / disables QC check for library RT shifts
        in_run_rt_enabled (bool):
            Enables / disables QC check for in-run RT shifts
        library_mz_enabled (bool):
            Enables / disables QC check for library m/z shifts

    Returns:
        None
    """

    # Connect to database
    db_metadata, connection = connect_to_database("Settings")

    try:
        # Get QC parameters table
        qc_parameters_table = sa.Table("qc_parameters", db_metadata, autoload=True)

        # Prepare update of user-inputted QC parameters
        update_parameters = (
            sa.update(qc_parameters_table)
                .where(qc_parameters_table.c.config_name == config_name)
                .values(intensity_dropouts_cutoff=intensity_dropouts_cutoff,
                        library_rt_shift_cutoff=library_rt_shift_cutoff,
                        in_run_rt_shift_cutoff=in_run_rt_shift_cutoff,
                        library_mz_shift_cutoff=library_mz_shift_cutoff,
                        intensity_enabled=intensity_enabled,
                        library_rt_enabled=library_rt_enabled,
                        in_run_rt_enabled=in_run_rt_enabled,
                        library_mz_enabled=library_mz_enabled)
        )

        # Execute UPDATE to database
        connection.execute(update_parameters)

    finally:
        # Always release the connection, even if the update fails
        connection.close()
2843
2844
def get_samples_in_run(instrument_id, run_id, sample_type="Both"):

    """
    Returns DataFrame of samples for a given instrument run from instrument database.

    For "Both", biological standard rows (without their "biological_standard" column) come first,
    followed by the sample rows, under a fresh index.

    Args:
        instrument_id (str):
            Instrument ID
        run_id (str):
            Instrument run ID (job ID)
        sample_type (str):
            Sample type, either "Sample" or "Biological Standard" or "Both"

    Returns:
        DataFrame of sample tables for a given instrument run.

    Raises:
        ValueError: If sample_type is not one of the three supported values.
    """

    if sample_type == "Sample":
        df = get_table(instrument_id, "sample_qc_results")

    elif sample_type == "Biological Standard":
        df = get_table(instrument_id, "bio_qc_results")

    elif sample_type == "Both":
        df_samples = get_table(instrument_id, "sample_qc_results")
        df_bio_standards = get_table(instrument_id, "bio_qc_results").drop(columns=["biological_standard"])
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
        df = pd.concat([df_bio_standards, df_samples], ignore_index=True)

    else:
        raise ValueError(
            'sample_type must be "Sample", "Biological Standard" or "Both", got ' + repr(sample_type))

    return df.loc[df["run_id"] == run_id]
2875
2876
def get_samples_from_csv(instrument_id, run_id, sample_type="Both"):

    """
    Returns DataFrame of samples in a given run using CSV files from Google Drive.

    CSV files of the run metadata, samples, and biological standards tables are stored
    in the ../data/Instrument_ID_Run_ID/csv directory, and removed on job completion.

    For "Both", biological standard rows (without their "biological_standard" column) come first,
    followed by the sample rows, under a fresh index. The "id" column is dropped when present.

    Args:
        instrument_id (str):
            Instrument ID
        run_id (str):
            Instrument run ID (job ID)
        sample_type (str):
            Sample type, either "Sample" or "Biological Standard" or "Both"

    Returns:
        DataFrame of samples for a given instrument run.

    Raises:
        ValueError: If sample_type is not one of the three supported values.
    """

    # Folder name is the instrument ID (spaces replaced by underscores) joined to the run ID
    run_folder = instrument_id.replace(" ", "_") + "_" + run_id
    csv_directory = os.path.join(data_directory, run_folder, "csv")

    samples_csv = os.path.join(csv_directory, "samples.csv")
    bio_standards_csv = os.path.join(csv_directory, "bio_standards.csv")

    if sample_type == "Sample":
        df = pd.read_csv(samples_csv, index_col=False)

    elif sample_type == "Biological Standard":
        df = pd.read_csv(bio_standards_csv, index_col=False)

    elif sample_type == "Both":
        df_samples = pd.read_csv(samples_csv, index_col=False)
        df_bio_standards = pd.read_csv(bio_standards_csv, index_col=False).drop(columns=["biological_standard"])
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
        df = pd.concat([df_bio_standards, df_samples], ignore_index=True)

    else:
        raise ValueError(
            'sample_type must be "Sample", "Biological Standard" or "Both", got ' + repr(sample_type))

    df = df.loc[df["run_id"] == run_id]

    # Drop the "id" column only if it exists; unlike a `return` inside `finally`,
    # this does not hide unrelated errors
    return df.drop(columns=["id"], errors="ignore")
2921
2922
def get_next_sample(sample_id, instrument_id, run_id):

    """
    Looks up the sample that comes right after the given sample in an instrument run.

    Args:
        sample_id (str):
            Sample ID
        instrument_id (str):
            Instrument ID
        run_id (str):
            Instrument run ID (job ID)

    Returns:
        str: ID of the sample following sample_id in the run, or None if sample_id is the last one.
    """

    # All sample IDs of the run (samples and biological standards), in table order
    df_run_samples = get_samples_in_run(instrument_id, run_id, "Both")
    run_samples = df_run_samples["sample_id"].astype(str).tolist()

    # Position right after the given sample
    following = run_samples.index(sample_id) + 1

    # The given sample was the last one
    if following == len(run_samples):
        return None

    return run_samples[following]
2952
2953
def get_remaining_samples(instrument_id, run_id):

    """
    Lists the samples still remaining in a given instrument run (QC job).

    The list starts at the run's recorded "latest_sample" (inclusive). When no latest sample
    has been recorded yet, every sample of the run is returned.

    TODO: This function should just return the samples with null values in the "qc_result" column.
        The "latest_sample" value in the "runs" table may be unreliable.

    Args:
        instrument_id (str):
            Instrument ID
        run_id (str):
            Instrument run ID (job ID)

    Returns:
        list: List of samples remaining in a QC job.
    """

    # Latest sample recorded for the run, as a string
    run_metadata = get_instrument_run(instrument_id, run_id)
    last_recorded = run_metadata["latest_sample"].astype(str).values[0]

    # All sample IDs of the run (samples and biological standards)
    df_run_samples = get_samples_in_run(instrument_id, run_id, "Both")
    sample_ids = df_run_samples["sample_id"].astype(str).tolist()

    if last_recorded != "None":
        # Slice from the latest sample through the end of the run
        start = sample_ids.index(last_recorded)
        return sample_ids[start:]

    # Beginning of run: nothing recorded yet, so everything remains
    return sample_ids
2988
2989
def get_unprocessed_samples(instrument_id, run_id):

    """
    For an active run, returns 1) a list of samples that were not processed due to error / runtime termination,
    and 2) the current sample being monitored / processed.

    A sample counts as unprocessed when its "qc_result" is null and an entry whose name (up to the
    first ".") equals the sample ID exists in the run's acquisition path. The last such sample,
    in run order, is reported as the current sample and excluded from the list.

    Args:
        instrument_id (str):
            Instrument ID
        run_id (str):
            Instrument run ID (job ID)

    Returns:
        tuple: List of unprocessed samples for the given instrument run, and current sample being
        monitored / processed (None if there is no acquired, unprocessed sample).
    """

    # Samples of the run that have no QC result yet, in run order
    df_samples = get_samples_in_run(instrument_id, run_id, "Both")
    pending_samples = df_samples.loc[df_samples["qc_result"].isnull(), "sample_id"].astype(str).tolist()

    # Names (without extension) of everything present in the acquisition path
    acquisition_path = get_acquisition_path(instrument_id, run_id)
    acquired = {file.split(".")[0] for file in os.listdir(acquisition_path)}

    # Keep only the pending samples whose data file has been acquired
    unprocessed_samples = [sample for sample in pending_samples if sample in acquired]

    # The most recent acquired sample is the one currently being monitored / processed
    current_sample = unprocessed_samples.pop() if unprocessed_samples else None

    # Return as tuple
    return unprocessed_samples, current_sample
3036
3037
def get_current_sample(instrument_id, run_id):

    """
    Returns the current sample being monitored / processed.

    TODO: The "latest_sample" is the last sample to be processed. Nomenclature needs to be updated in many places.

    Only the beginning-of-run case is implemented: when no "latest_sample" has been recorded for the run,
    the second sample of the run is returned. In every other case the function returns None.

    Args:
        instrument_id (str):
            Instrument ID
        run_id (str):
            Instrument run ID (job ID)

    Returns:
        str: Second sample of the run when the run has no recorded latest sample and at least
        two samples; None otherwise.
    """

    # Get latest sample in run
    df_run = get_instrument_run(instrument_id, run_id)
    latest_sample = df_run["latest_sample"].astype(str).values[0]

    if latest_sample != "None":
        # NOTE(review): the original code returned nothing here; what should be reported once
        # a latest sample exists is not defined in this function - confirm intended behavior
        return None

    # Beginning of run: load the run's samples (this list was previously never defined,
    # which made this branch fail with a NameError)
    samples = get_samples_in_run(instrument_id, run_id, "Both")["sample_id"].astype(str).tolist()

    # Return second sample if beginning of run
    return samples[1] if len(samples) > 1 else None
3062
3063
def parse_internal_standard_data(instrument_id, run_id, result_type, polarity, load_from, as_json=True):

    """
    Builds a samples (rows) vs. internal standards (columns) table from per-sample records.

    Each sample stores its values for a result type (for example, "retention_time") as a
    single-record string dict shaped like this:

    | Sample     | iSTD 1 | iSTD 2 | ... |
    | ---------- | ------ | ------ | ... |
    | SAMPLE_001 | 1.207  | 1.934  | ... |

    The records are parsed and stacked with a single pd.DataFrame() call, which is 100x faster than pd.concat().
    The "Name" column of the records is dropped and a "Sample" column with the sample IDs is added.

    Args:
        instrument_id (str):
            Instrument ID
        run_id (str):
            Instrument run ID (job ID)
        result_type (str):
            Column in sample_qc_results table to parse (either "retention_time" or "precursor_mz" or "intensity")
        polarity (str):
            Polarity ("Pos" or "Neg")
        load_from (str):
            Data source: "database" or "processing" read the instrument database, "csv" reads the
            CSV files used during Google Drive sync of an active run
        as_json (bool, default True):
            Whether to return table as JSON string or as DataFrame

    Returns:
        Table of samples (rows) vs. internal standards (columns), as JSON string or DataFrame.
        None when load_from is "processing" and no sample has a value for result_type.
    """

    # Load the sample QC results from the requested source
    if load_from in ("database", "processing"):
        df_samples = get_samples_in_run(instrument_id, run_id, "Sample")
    elif load_from == "csv":
        df_samples = get_samples_from_csv(instrument_id, run_id, "Sample")

    # Keep only the rows of the requested polarity
    polarity_rows = df_samples.loc[df_samples["polarity"] == polarity]
    sample_ids = polarity_rows["sample_id"].astype(str).tolist()

    # During processing, signal "no results yet" with None
    if load_from == "processing" and len(polarity_rows[result_type].dropna()) == 0:
        return None

    # Parse each stored record; missing records become empty dicts
    records = []
    for raw_record in polarity_rows[result_type].astype(str).tolist():
        if raw_record in ("None", "nan"):
            records.append({})
        else:
            records.append(ast.literal_eval(raw_record))

    # Stack the records and swap the "Name" column for a "Sample" column
    df_results = pd.DataFrame(records).drop(columns=["Name"])
    df_results["Sample"] = sample_ids

    if not as_json:
        return df_results

    return df_results.to_json(orient="records")
3122
3123
def parse_biological_standard_data(instrument_id, run_id, result_type, polarity, biological_standard, load_from, as_json=True):

    """
    Parses biological standard data into JSON-ified DataFrame of targeted features (as columns) vs. instrument runs (as rows).

    The bio_qc_results table in the instrument database is first filtered by biological standard, chromatography, and polarity.
    Then, the sample name is replaced with the instrument run it was associated with.

    Data is stored in a column (for example, "intensity") as a single-record string dict with the following structure:

    | Name                | Metabolite 1 | Metabolite 2 | ... |
    | ------------------- | ------------ | ------------ | ... |
    | INSTRUMENT_RUN_001  | 13597340     | 53024853     | ... |

    These records are concatenated together with this function using pd.DataFrame(), which is 100x faster than pd.concat().

    | Name                | Metabolite 1 | Metabolite 2 | ... |
    | ------------------- | ------------ | ------------ | ... |
    | INSTRUMENT_RUN_001  | 13597340     | 53024853     | ... |
    | INSTRUMENT_RUN_002  | 23543246     | 102030406    | ... |
    | ...                 | ...          | ...          | ... |

    Args:
        instrument_id (str):
            Instrument ID
        run_id (str):
            Instrument run ID (job ID)
        result_type (str):
            Column in bio_qc_results table to parse (either "retention_time" or "precursor_mz" or "intensity")
        polarity (str):
            Polarity ("Pos" or "Neg")
        biological_standard (str):
            Name of biological standard
        load_from (str):
            Data source: "database" or "processing" read the instrument database (as in the other parse
            functions of this module), "csv" reads the CSV file used during Google Drive sync of an active run
        as_json (bool, default True):
            Whether to return table as JSON string or as DataFrame

    Returns:
        JSON-ified DataFrame of targeted features for a biological standard (columns) vs. instrument runs (rows).

    Raises:
        ValueError: If load_from is not "database", "processing" or "csv".
    """

    # Get relevant QC results table from the requested source
    if load_from in ("database", "processing"):
        df_samples = get_table(instrument_id, "bio_qc_results")
    elif load_from == "csv":
        run_folder = instrument_id.replace(" ", "_") + "_" + run_id
        bio_standards_csv = os.path.join(data_directory, run_folder, "csv", "bio_standards.csv")
        df_samples = pd.read_csv(bio_standards_csv, index_col=False)
    else:
        raise ValueError('load_from must be "database", "processing" or "csv", got ' + repr(load_from))

    # Filter by biological standard type
    df_samples = df_samples.loc[df_samples["biological_standard"] == biological_standard]

    # Filter by polarity
    df_samples = df_samples.loc[df_samples["polarity"] == polarity]

    # Look up the chromatography method used by the given run
    df_runs = get_table(instrument_id, "runs")
    chromatography = df_runs.loc[df_runs["run_id"] == run_id]["chromatography"].values[0]

    # Filter by chromatography: keep results from every run that used the same method
    run_ids = df_runs.loc[df_runs["chromatography"] == chromatography]["run_id"].astype(str).tolist()
    df_samples = df_samples.loc[df_samples["run_id"].isin(run_ids)]
    run_ids = df_samples["run_id"].astype(str).tolist()

    # Initialize DataFrame with individual records of sample data; missing records become empty dicts
    results = df_samples[result_type].fillna('{}').astype(str).tolist()
    results = [ast.literal_eval(result) if result != "None" and result != "nan" else {} for result in results]
    df_results = pd.DataFrame(results)

    # Label each row with the run it came from
    df_results["Name"] = run_ids

    # Return DataFrame as JSON string
    if as_json:
        return df_results.to_json(orient="records")
    else:
        return df_results
3200
3201
def parse_internal_standard_qc_data(instrument_id, run_id, polarity, result_type, load_from, as_json=True):

    """
    Builds a samples (rows) vs. internal standards (columns) table for one kind of QC result.

    Each sample stores its QC data in the "qc_dataframe" column as a string holding a list of
    records, one per QC result kind, in this order:

    | Sample     | Delta m/z | Delta RT | In-run delta RT | Intensity dropout | Warnings | Fails |
    | ---------- | --------- | -------- | --------------- | ----------------- | -------- | ----- |
    | SAMPLE_001 | 0.000001  | 0.001    | 0.00001         | ...               | None     | None  |

    The record matching result_type is taken from every sample and the records are stacked with a single
    pd.DataFrame() call, which is 100x faster than pd.concat(). The "Name" column of the records is
    dropped and a "Sample" column with the sample IDs is added.

    Args:
        instrument_id (str):
            Instrument ID
        run_id (str):
            Instrument run ID (job ID)
        polarity (str):
            Polarity ("Pos" or "Neg")
        result_type (str):
            QC result to extract: "Delta m/z", "Delta RT", "In-run delta RT", "Intensity dropout",
            "Warnings" or "Fails"
        load_from (str):
            Data source: "database" or "processing" read the instrument database, "csv" reads the
            CSV files used during Google Drive sync of an active run
        as_json (bool, default True):
            Whether to return table as JSON string or as DataFrame

    Returns:
        Table of QC data for samples (as rows) vs. internal standards (as columns), as JSON string or DataFrame.
    """

    # Load the sample QC results from the requested source
    if load_from in ("database", "processing"):
        df_samples = get_samples_in_run(instrument_id, run_id, "Sample")
    elif load_from == "csv":
        df_samples = get_samples_from_csv(instrument_id, run_id, "Sample")

    # Keep only the rows of the requested polarity
    polarity_rows = df_samples.loc[df_samples["polarity"] == polarity]

    # Sample IDs and raw QC strings; samples without QC data get a list of six empty records
    sample_ids = polarity_rows["sample_id"].astype(str).tolist()
    raw_qc_data = polarity_rows["qc_dataframe"].fillna('[{}, {}, {}, {}, {}, {}]').astype(str).tolist()

    # Position of each result type inside a stored QC list
    result_labels = ["Delta m/z", "Delta RT", "In-run delta RT", "Intensity dropout", "Warnings", "Fails"]
    position_of = {label: position for position, label in enumerate(result_labels)}
    position = position_of[result_type]

    # Pick the requested record out of every sample's QC list
    records = []
    for raw in raw_qc_data:
        records.append(ast.literal_eval(raw)[position])

    # Stack the records and swap the "Name" column for a "Sample" column
    df_results = pd.DataFrame(records).drop(columns=["Name"])
    df_results["Sample"] = sample_ids

    if not as_json:
        return df_results

    return df_results.to_json(orient="records")
3267
3268
def get_workspace_users_list():

    """
    Lists the email addresses of users that have access to the MS-AutoQC workspace.

    Returns:
        list: Values of the "email_address" column of the "gdrive_users" table in the Settings database, as strings.
    """

    df_gdrive_users = get_table("Settings", "gdrive_users")
    email_addresses = df_gdrive_users["email_address"].astype(str)
    return email_addresses.tolist()
3276
3277
def add_user_to_workspace(email_address):

    """
    Gives user access to workspace in Google Drive and stores email address in database.

    Access is granted by sharing the MS-AutoQC folder in Google Drive with the user's Google account
    (as a "writer"). The permission's name and ID are stored next to the email address in the
    "gdrive_users" table.

    Args:
        email_address (str): Email address for Google account to grant access to workspace.

    Returns:
        "User already exists" if the email address is already in the workspace, "Error" if the
        Google Drive folder ID could not be determined, otherwise None.
    """

    if email_address in get_workspace_users_list():
        return "User already exists"

    # Get Google Drive instance
    drive = get_drive_instance()

    # Get ID of MS-AutoQC folder in Google Drive
    gdrive_folder_id = get_drive_folder_id()

    if gdrive_folder_id is None:
        return "Error"

    # Add user access by updating permissions
    folder = drive.CreateFile({"id": gdrive_folder_id})
    permission = folder.InsertPermission({
        "type": "user",
        "role": "writer",
        "value": email_address})

    # Insert user email address in "gdrive_users" table
    db_metadata, connection = connect_to_database("Settings")

    try:
        gdrive_users_table = sa.Table("gdrive_users", db_metadata, autoload=True)

        insert_user_email = gdrive_users_table.insert().values(
            {"name": permission["name"],
             "email_address": email_address,
             "permission_id": permission["id"]})

        connection.execute(insert_user_email)

    finally:
        # Always release the connection, even if the insert fails
        connection.close()
3323
3324
def delete_user_from_workspace(email_address):

    """
    Removes user access to workspace in Google Drive and deletes email from database.

    The Drive permission ID stored for the user in the "gdrive_users" table is used to revoke access
    to the MS-AutoQC folder, after which the user's row is deleted from that table.

    Args:
        email_address (str): Email address for Google account whose access will be revoked.

    Returns:
        None on success, "User does not exist" if the email address is not registered,
        or "Error" if no Google Drive folder ID is stored for the workspace.
    """

    if email_address not in get_workspace_users_list():
        return "User does not exist"

    # Get Google Drive instance
    drive = get_drive_instance()

    # Get ID of MS-AutoQC folder in Google Drive
    gdrive_folder_id = get_drive_folder_id()

    if gdrive_folder_id is None:
        return "Error"

    # Get permission ID of user from database
    folder = drive.CreateFile({"id": gdrive_folder_id})
    df_gdrive_users = get_table("Settings", "gdrive_users")
    df_gdrive_users = df_gdrive_users.loc[df_gdrive_users["email_address"] == email_address]
    permission_id = df_gdrive_users["permission_id"].astype(str).values[0]

    # Delete user access by updating permissions
    folder.DeletePermission(permission_id)

    # Delete user email address in "gdrive_users" table
    db_metadata, connection = connect_to_database("Settings")

    try:
        gdrive_users_table = sa.Table("gdrive_users", db_metadata, autoload=True)

        delete_user_email = (
            sa.delete(gdrive_users_table)
                .where((gdrive_users_table.c.email_address == email_address))
        )

        connection.execute(delete_user_email)
    finally:
        # Release the connection even if reflection or the delete fails
        connection.close()
3370
3371
def get_qc_results(instrument_id, sample_list, is_bio_standard=False):

    """
    Returns DataFrame of QC results for a given sample list.

    Sample IDs are passed to the database as bound parameters rather than being spliced into the
    SQL text, so IDs containing quotes or brackets are matched literally.

    TODO: This function will break if samples in different runs have the same sample ID. Add run ID filter.

    Args:
        instrument_id (str):
            Instrument ID
        sample_list (list):
            List of samples to query
        is_bio_standard (bool, default False):
            Whether the list is biological standards (True) or samples (False)

    Returns:
        DataFrame with "sample_id" and "qc_result" columns for the given sample list
        (an empty DataFrame with no columns if the sample list is empty).
    """

    if len(sample_list) == 0:
        return pd.DataFrame()

    database = get_database_file(instrument_id=instrument_id, sqlite_conn=True)
    engine = sa.create_engine(database)

    # Only these two fixed table names are ever interpolated into the statement
    table_name = "bio_qc_results" if is_bio_standard else "sample_qc_results"

    # An "expanding" bind parameter renders one placeholder per sample ID at execution time
    query = sa.text(
        "SELECT sample_id, qc_result FROM " + table_name + " WHERE sample_id IN :sample_ids"
    ).bindparams(sa.bindparam("sample_ids", expanding=True))

    try:
        return pd.read_sql(query, engine, params={"sample_ids": list(sample_list)})
    finally:
        # Release pooled connections held by this throwaway engine
        engine.dispose()
3405
3406
def create_workspace_metadata():

    """
    Creates record in "workspace" table to store various metadata.

    A single row with id 1 is inserted into the "workspace" table of the Settings database.

    Returns:
        None
    """

    db_metadata, connection = connect_to_database("Settings")

    try:
        workspace_table = sa.Table("workspace", db_metadata, autoload=True)
        connection.execute(workspace_table.insert().values({"id": 1}))
    finally:
        # Release the connection even if reflection or the insert fails
        connection.close()
3417
3418
def get_device_identity():

    """
    Returns the identity of the current device (either an Instrument ID or "Shared user").

    Returns:
        str: First value of the "instrument_identity" column in the "workspace" table
        of the Settings database.
    """

    df_workspace = get_table("Settings", "workspace")
    identities = df_workspace["instrument_identity"].astype(str).tolist()
    return identities[0]
3426
3427
def set_device_identity(is_instrument_computer, instrument_id):

    """
    Indicates whether the user's device is the instrument PC or not.

    Updates the "is_instrument_computer" and "instrument_identity" columns of the workspace
    record (id 1) in the Settings database.

    Args:
        is_instrument_computer (bool):
            Whether the device is an instrument computer or not
        instrument_id (str):
            Instrument ID (ignored and replaced with "Shared user" if the device
            is not an instrument computer)

    Returns:
        None
    """

    if not is_instrument_computer:
        instrument_id = "Shared user"

    db_metadata, connection = connect_to_database("Settings")

    try:
        workspace_table = sa.Table("workspace", db_metadata, autoload=True)

        update_identity = (
            sa.update(workspace_table)
                .where(workspace_table.c.id == 1)
                .values(
                    is_instrument_computer=is_instrument_computer,
                    instrument_identity=instrument_id
            )
        )

        connection.execute(update_identity)
    finally:
        # Release the connection even if reflection or the update fails
        connection.close()
3460
3461
def run_is_on_instrument_pc(instrument_id, run_id):

    """
    Checks whether the current device is the instrument PC that the given run belongs to.

    The "instrument_id" stored for the run is compared against the "instrument_identity"
    stored in the "workspace" table of the Settings database.

    TODO: Use this function in PlotGeneration and DashWebApp module.

    Args:
        instrument_id (str):
            Instrument ID
        run_id (str):
            Instrument run ID

    Returns:
        True if instrument run was started on the current device, and False if not.
    """

    df_run = get_instrument_run(instrument_id, run_id)
    run_instrument_id = df_run["instrument_id"].astype(str).tolist()[0]

    df_workspace = get_table("Settings", "workspace")
    device_identity = df_workspace["instrument_identity"].astype(str).tolist()[0]

    return run_instrument_id == device_identity
3486
3487
def update_slack_bot_token(slack_bot_token):

    """
    Updates Slack bot user OAuth 2.0 token in "workspace" table of Settings database.

    For details on the Slack API, see: https://slack.dev/python-slack-sdk/

    Args:
        slack_bot_token (str): Slack bot user OAuth token

    Returns:
        None
    """

    db_metadata, connection = connect_to_database("Settings")

    try:
        workspace_table = sa.Table("workspace", db_metadata, autoload=True)

        # Named "statement" so the local does not shadow this function's own name
        statement = (
            sa.update(workspace_table)
                .where(workspace_table.c.id == 1)
                .values(slack_bot_token=slack_bot_token)
        )

        connection.execute(statement)
    finally:
        # Release the connection even if reflection or the update fails
        connection.close()
3513
3514
def get_slack_bot_token():

    """
    Returns the Slack bot token saved in the workspace record.

    Returns:
        str: First value of the "slack_bot_token" column in the "workspace" table
        of the Settings database, converted to a string.
    """

    df_workspace = get_table("Settings", "workspace")
    tokens = df_workspace["slack_bot_token"].astype(str)
    return tokens.values[0]
3522
3523
def update_slack_channel(slack_channel, notifications_enabled):

    """
    Updates Slack channel registered for notifications in "workspace" table of Settings database.

    Any "#" characters are removed from the channel name before it is stored.

    Args:
        slack_channel (str):
            Slack channel to post messages to
        notifications_enabled (bool):
            Whether to send Slack notifications for QC warnings and fails

    Returns:
        None
    """

    db_metadata, connection = connect_to_database("Settings")

    try:
        workspace_table = sa.Table("workspace", db_metadata, autoload=True)

        # Named "statement" so the local does not shadow this function's own name
        statement = (
            sa.update(workspace_table)
                .where(workspace_table.c.id == 1)
                .values(
                    slack_channel=slack_channel.replace("#", ""),
                    slack_enabled=notifications_enabled)
        )

        connection.execute(statement)
    finally:
        # Release the connection even if reflection or the update fails
        connection.close()
3552
3553
def get_slack_channel():

    """
    Returns the Slack channel saved in the workspace record.

    Returns:
        str: First value of the "slack_channel" column in the "workspace" table
        of the Settings database, converted to a string.
    """

    df_workspace = get_table("Settings", "workspace")
    channels = df_workspace["slack_channel"].astype(str)
    return channels.values[0]
3561
3562
def get_slack_notifications_toggled():

    """
    Returns Slack notification toggle setting.

    Returns:
        int: First value of the "slack_enabled" column in the "workspace" table, as an integer,
        or None if the value cannot be read or converted to an integer.
    """

    try:
        return get_table("Settings", "workspace")["slack_enabled"].astype(int).tolist()[0]
    except Exception:
        # Best-effort lookup: an unreadable or unset value is reported as None.
        # Only Exception is caught so KeyboardInterrupt / SystemExit still propagate.
        return None
3573
3574
def get_email_notifications_list(as_string=False):

    """
    Returns list of emails registered for email notifications for QC warnings and fails.

    Args:
        as_string (bool, default False):
            Whether to return the list as a string (for Gmail API) or as list object (for display in Settings page)

    Returns:
        List of emails registered for QC warning/fail notifications, or a single
        comma-separated string of those emails if as_string is True.
    """

    email_list = get_table("Settings", "email_notifications")["email_address"].astype(str).tolist()

    if as_string:
        # join() places a comma between every pair of entries, including duplicates;
        # comparing each entry against the last one would fuse repeated addresses together
        return ",".join(email_list)

    return email_list
3602
3603
def register_email_for_notifications(email_address):

    """
    Inserts email address into "email_notifications" table in Settings database.

    Args:
        email_address (str): Email address to register for notifications.

    Returns:
        None
    """

    db_metadata, connection = connect_to_database("Settings")

    try:
        email_notifications_table = sa.Table("email_notifications", db_metadata, autoload=True)

        insert_email_address = email_notifications_table.insert().values({
            "email_address": email_address
        })

        connection.execute(insert_email_address)
    finally:
        # Release the connection even if reflection or the insert fails
        connection.close()
3625
3626
def delete_email_from_notifications(email_address):

    """
    Deletes email address from "email_notifications" table in Settings database.

    Args:
        email_address (str): Email address to unsubscribe from notifications.

    Returns:
        None
    """

    db_metadata, connection = connect_to_database("Settings")

    try:
        email_notifications_table = sa.Table("email_notifications", db_metadata, autoload=True)

        delete_email_address = (
            sa.delete(email_notifications_table)
                .where((email_notifications_table.c.email_address == email_address))
        )

        connection.execute(delete_email_address)
    finally:
        # Release the connection even if reflection or the delete fails
        connection.close()
3649
3650
def get_completed_samples_count(instrument_id, run_id, status):

    """
    Returns the number of completed samples and the total number of samples for an instrument run.

    Args:
        instrument_id (str):
            Instrument ID
        run_id (str):
            Instrument run ID (job ID)
        status (str):
            Instrument run (QC job) status, either "Active" or "Complete"

    Returns:
        Tuple of (completed samples, total samples), both as integers.
    """

    # The run is read via get_instrument_run_from_csv only for an active run with sync enabled
    # on a device whose identity is not this instrument; otherwise the database is used
    load_from_csv = (
        status == "Active"
        and sync_is_enabled()
        and get_device_identity() != instrument_id
    )

    if load_from_csv:
        df_run = get_instrument_run_from_csv(instrument_id, run_id)
    else:
        df_run = get_instrument_run(instrument_id, run_id)

    completed_count = df_run["completed"].astype(int).tolist()[0]
    sample_count = df_run["samples"].astype(int).tolist()[0]
    return (completed_count, sample_count)
3679
3680
def get_run_progress(instrument_id, run_id, status):

    """
    Returns progress of instrument run as a percentage of samples completed.

    Args:
        instrument_id (str):
            Instrument ID
        run_id (str):
            Instrument run ID (job ID)
        status (str):
            Instrument run (QC job) status, either "Active" or "Complete"

    Returns:
        float: Percent of samples processed for the given instrument run, rounded to one
        decimal place (0.0 if the run has no samples).
    """

    completed, total_samples = get_completed_samples_count(instrument_id, run_id, status)

    # A run with no samples has made no progress; avoid dividing by zero
    if total_samples == 0:
        return 0.0

    percent_complete = (completed / total_samples) * 100
    return round(percent_complete, 1)
3701
3702
def update_sample_counters_for_run(instrument_id, run_id, latest_sample):

    """
    Recomputes the "completed", "passes" and "fails" counts for a run and stores the latest sample.

    Counts are derived from the "qc_result" values of all samples in the run (as returned by
    get_samples_in_run with sample type "Both") and written to the run's row in the "runs" table.

    TODO: The "latest_sample" is the last sample to be processed / completed.
        Nomenclature should be updated for clarity.

    Args:
        instrument_id (str):
            Instrument ID
        run_id (str):
            Instrument run ID (job ID)
        latest_sample (str):
            Last sample to be processed

    Returns:
        None
    """

    df = get_samples_in_run(instrument_id, run_id, "Both")

    # Tally QC results once; if the column cannot be read, every count falls back to zero
    try:
        result_counts = df["qc_result"].value_counts()
    except Exception:
        result_counts = {}

    passes = int(result_counts.get("Pass", 0))
    fails = int(result_counts.get("Fail", 0))

    # NOTE(review): samples with a "Warning" result are not included in "completed" --
    # confirm this is intended
    completed = passes + fails

    db_metadata, connection = connect_to_database(instrument_id)

    try:
        instrument_runs_table = sa.Table("runs", db_metadata, autoload=True)

        update_status = (
            sa.update(instrument_runs_table)
                .where(instrument_runs_table.c.run_id == run_id)
                .values(
                    completed=completed,
                    passes=passes,
                    fails=fails,
                    latest_sample=latest_sample
            )
        )

        connection.execute(update_status)
    finally:
        # Release the connection even if reflection or the update fails
        connection.close()
3758
3759
def mark_run_as_completed(instrument_id, run_id):

    """
    Marks instrument run status as completed.

    Sets the "status" column to "Complete" for the given run in the "runs" table
    of the instrument database.

    Args:
        instrument_id (str):
            Instrument ID
        run_id (str):
            Instrument run ID (job ID)

    Returns:
        None
    """

    db_metadata, connection = connect_to_database(instrument_id)

    try:
        instrument_runs_table = sa.Table("runs", db_metadata, autoload=True)

        update_status = (
            sa.update(instrument_runs_table)
                .where(instrument_runs_table.c.run_id == run_id)
                .values(status="Complete")
        )

        connection.execute(update_status)
    finally:
        # Release the connection even if reflection or the update fails
        connection.close()
3786
3787
def skip_sample(instrument_id, run_id):

    """
    Skips sample by setting "latest_sample" value for instrument run to the next sample.

    This function was used after restarting the acquisition listener when MS-DIAL got stuck processing a corrupted file.
    Now that MS-DIAL runs in the background, it is deprecated and should be removed.

    Args:
        instrument_id (str):
            Instrument ID
        run_id (str):
            Instrument run ID (job ID)

    Returns:
        None
    """

    # Get next sample (second entry of the remaining samples)
    samples = get_remaining_samples(instrument_id, run_id)
    next_sample = samples[1]

    # Set latest sample to next sample
    db_metadata, connection = connect_to_database(instrument_id)

    try:
        instrument_runs_table = sa.Table("runs", db_metadata, autoload=True)

        connection.execute((
            sa.update(instrument_runs_table)
                .where(instrument_runs_table.c.run_id == run_id)
                .values(latest_sample=next_sample)
        ))
    finally:
        # Release the connection even if reflection or the update fails
        connection.close()
3821
3822
def store_pid(instrument_id, run_id, pid):

    """
    Stores acquisition listener subprocess ID to allow for checkup and termination.

    The process ID is written to the "pid" column of the given run in the "runs" table
    of the instrument database.

    Args:
        instrument_id (str):
            Instrument ID
        run_id (str):
            Instrument run ID (job ID)
        pid (str):
            Process ID for acquisition listener subprocess

    Returns:
        None
    """

    db_metadata, connection = connect_to_database(instrument_id)

    try:
        instrument_runs_table = sa.Table("runs", db_metadata, autoload=True)

        update_pid = (
            sa.update(instrument_runs_table)
                .where(instrument_runs_table.c.run_id == run_id)
                .values(pid=pid)
        )

        connection.execute(update_pid)
    finally:
        # Release the connection even if reflection or the update fails
        connection.close()
3851
3852
def get_pid(instrument_id, run_id):

    """
    Retrieves the acquisition listener process ID recorded for an instrument run.

    Args:
        instrument_id (str):
            Instrument ID
        run_id (str):
            Instrument run ID (job ID)

    Returns:
        int: Value of the "pid" column for the given instrument run, or None if it
        cannot be read or converted to an integer.
    """

    try:
        return get_instrument_run(instrument_id, run_id)["pid"].astype(int).tolist()[0]
    except Exception:
        # Best-effort lookup: a missing or unset PID is reported as None.
        # Only Exception is caught so KeyboardInterrupt / SystemExit still propagate.
        return None
3872
3873
def upload_to_google_drive(file_dict):

    """
    Uploads each existing file in the given dictionary to the MS-AutoQC folder in Google Drive.

    Files whose path does not exist are skipped. Nothing is uploaded if the stored
    Google Drive folder ID is None, "None", or an empty string.

    Args:
        file_dict (dict):
            Dictionary with key-value structure { filename : file path }

    Returns:
        dict: Dictionary with key-value structure { filename : Google Drive ID }
    """

    # Google Drive instance and ID of the MS-AutoQC folder
    drive = get_drive_instance()
    folder_id = get_drive_folder_id()

    # Drive ID's of uploaded file(s), keyed by title
    uploaded_ids = {}

    # Without a usable folder ID there is nowhere to upload to
    if folder_id is None or folder_id == "None" or folder_id == "":
        return uploaded_ids

    for filename, file_path in file_dict.items():
        if not os.path.exists(file_path):
            continue

        drive_file = drive.CreateFile(metadata={
            "title": filename,
            "parents": [{"id": folder_id}],
        })
        drive_file.SetContentFile(file_path)
        drive_file.Upload()

        uploaded_ids[drive_file["title"]] = drive_file["id"]

    return uploaded_ids
3914
3915
def upload_qc_results(instrument_id, run_id):

    """
    Uploads QC results for a given instrument run to Google Drive as CSV files.

    The run record, sample QC results and biological standard QC results are written to CSV
    files under data/<instrument>_<run>/csv, compressed into a ZIP archive, and uploaded.
    If the run already has a Drive ID stored, that file is updated in place; otherwise the
    archive is uploaded as a new file and its Drive ID is saved in the "runs" table.

    Args:
        instrument_id (str):
            Instrument ID
        run_id (str):
            Instrument run ID (job ID)

    Returns:
        None

    Raises:
        KeyError: If a first-time upload does not return a Drive ID for the ZIP archive.
    """

    # "run_key" rather than "id", which would shadow the builtin
    run_key = instrument_id.replace(" ", "_") + "_" + run_id

    # Get Google Drive instance
    drive = get_drive_instance()

    # Define file names and file paths
    run_filename = "run.csv"
    samples_csv_filename = "samples.csv"
    bio_standards_csv_filename = "bio_standards.csv"

    run_directory = os.path.join(data_directory, run_key)
    csv_directory = os.path.join(run_directory, "csv")

    # Creates the run directory as well, and tolerates directories that already exist
    os.makedirs(csv_directory, exist_ok=True)

    run_csv_path = os.path.join(csv_directory, run_filename)
    samples_csv_path = os.path.join(csv_directory, samples_csv_filename)
    bio_standards_csv_path = os.path.join(csv_directory, bio_standards_csv_filename)

    # Convert run, sample and biological standard QC results from database into CSV files
    df_run = get_instrument_run(instrument_id, run_id)
    df_run.to_csv(run_csv_path, index=False)

    df_samples = get_samples_in_run(instrument_id=instrument_id, run_id=run_id, sample_type="Sample")
    if len(df_samples) > 0:
        df_samples.to_csv(samples_csv_path, index=False)

    df_bio_standards = get_table(instrument_id, "bio_qc_results")
    if len(df_bio_standards) > 0:
        df_bio_standards.to_csv(bio_standards_csv_path, index=False)

    # Compress CSV files into a ZIP archive for faster upload
    zip_filename = run_key + ".zip"
    zip_file_path = zip_csv_files(
        input_directory=csv_directory, output_directory_and_name=os.path.join(run_directory, run_key))

    # Get Google Drive ID for the CSV files ZIP archive (from the run record loaded above)
    zip_file_drive_id = df_run["drive_id"].tolist()[0]

    # Update existing ZIP archive in Google Drive
    if zip_file_drive_id is not None:

        file = drive.CreateFile({
            "id": zip_file_drive_id,
            "title": zip_filename,
        })

        # Execute upload
        file.SetContentFile(zip_file_path)
        file.Upload()
        return

    # Upload CSV files ZIP archive to Google Drive for first time
    drive_id = upload_to_google_drive({zip_filename: zip_file_path})[zip_filename]

    # Store Drive ID of ZIP file in local database
    db_metadata, connection = connect_to_database(instrument_id)

    try:
        runs_table = sa.Table("runs", db_metadata, autoload=True)

        connection.execute((
            sa.update(runs_table)
                .where(runs_table.c.run_id == run_id)
                .values(drive_id=drive_id)
        ))
    finally:
        # Release the connection even if reflection or the update fails
        connection.close()
4004
4005
def download_qc_results(instrument_id, run_id):

    """
    Downloads CSV files of QC results from Google Drive and stores in /data directory.

    The ZIP archive named <instrument>_<run>.zip is looked up in the MS-AutoQC Drive folder,
    saved to data/<instrument>_<run>/, and extracted into its "csv" subdirectory.

    Args:
        instrument_id (str):
            Instrument ID
        run_id (str):
            Instrument run ID (job ID)

    Returns:
        tuple: Paths of run.csv, samples.csv, and bio_standards.csv, respectively.
    """

    # "run_key" rather than "id", which would shadow the builtin
    run_key = instrument_id.replace(" ", "_") + "_" + run_id

    # Get Google Drive instance
    drive = get_drive_instance()

    # Initialize directories (creates the run directory too; existing directories are fine)
    run_directory = os.path.join(data_directory, run_key)
    csv_directory = os.path.join(run_directory, "csv")
    os.makedirs(csv_directory, exist_ok=True)

    # Zip file
    zip_filename = run_key + ".zip"
    zip_file_path = os.path.join(run_directory, zip_filename)

    # Get Google Drive folder ID
    gdrive_folder_id = get_drive_folder_id()

    # Find and download ZIP archive of CSV files from Google Drive
    folder_query = {"q": "'" + gdrive_folder_id + "' in parents and trashed=false"}

    for file in drive.ListFile(folder_query).GetList():
        if file["title"] == zip_filename:
            # Download straight to the target path instead of changing the process-wide
            # working directory, so a failed download cannot leave the app in the wrong directory
            file.GetContentFile(zip_file_path)
            break

    # Unzip archive
    unzip_csv_files(zip_file_path, csv_directory)

    # Define and return file paths
    run_csv = os.path.join(csv_directory, "run.csv")
    samples_csv = os.path.join(csv_directory, "samples.csv")
    bio_standards_csv_file = os.path.join(csv_directory, "bio_standards.csv")

    return (run_csv, samples_csv, bio_standards_csv_file)
4059
4060
def get_drive_folder_id():

    """
    Returns the Google Drive ID stored for the MS-AutoQC folder (found in user's root Drive directory).

    Returns:
        First value of the "gdrive_folder_id" column in the "workspace" table of the Settings database.
    """

    df_workspace = get_table("Settings", "workspace")
    folder_ids = df_workspace["gdrive_folder_id"].values
    return folder_ids[0]
4068
4069
def get_database_drive_id(instrument_id):

    """
    Looks up the Google Drive ID stored for a given instrument's database.

    Args:
        instrument_id (str): Instrument ID

    Returns:
        str: Google Drive ID for the instrument database ZIP archive, taken from the "drive_id"
        column of the matching row in the "instruments" table of the Settings database.
    """

    df_instruments = get_table("Settings", "instruments")
    matches_instrument = df_instruments["name"] == instrument_id
    drive_ids = df_instruments.loc[matches_instrument]["drive_id"].values
    return drive_ids[0]
4084
4085
def upload_database(instrument_id, sync_settings=False):

    """
    Zips the instrument database and uploads it to Google Drive, optionally with the methods directory.

    Args:
        instrument_id (str):
            Instrument ID for the instrument database to upload
        sync_settings (bool, default False):
            Whether to upload methods directory as well

    Returns:
        str: Timestamp (HH:MM:SS) upon upload completion, or None if the Drive folder ID
        or the database's Drive ID is missing.
    """

    # Drive ID's of the MS-AutoQC folder and of the database file
    gdrive_folder_id = get_drive_folder_id()
    instrument_db_file_id = get_database_drive_id(instrument_id)

    # Google Drive instance
    drive = get_drive_instance()

    # Vacuum the database so the upload is as small as possible
    execute_vacuum(instrument_id)

    # Methods directory goes up first when settings sync was requested
    if sync_settings == True:
        upload_methods()

    # Nothing more to do without both Drive ID's
    if gdrive_folder_id is None or instrument_db_file_id is None:
        return None

    # Zip the database and replace the existing file in Google Drive
    zip_database(instrument_id=instrument_id)
    zip_title = instrument_id.replace(" ", "_") + ".zip"
    drive_file = drive.CreateFile({"id": instrument_db_file_id, "title": zip_title})
    drive_file.SetContentFile(get_database_file(instrument_id, zip=True))
    drive_file.Upload()

    # Record modifiedDate of the uploaded database file
    remember_last_modified(database=instrument_id, modified_date=drive_file["modifiedDate"])

    return time.strftime("%H:%M:%S")
4132
4133
def download_database(instrument_id, sync_settings=False):

    """
    Downloads instrument database ZIP file from Google Drive.

    This function is called when accessing an instrument database from a device other than the given instrument.

    Args:
        instrument_id (str):
            Instrument ID for the instrument database to download
        sync_settings (bool, default False):
            Whether to download methods directory as well

    Returns:
        str: Timestamp (HH:MM:SS) upon download completion, or None if the database was not
        modified, a required Drive ID is missing, or the download failed.
    """

    db_zip_file = instrument_id.replace(" ", "_") + ".zip"

    # If the database was not modified by another instrument, skip download (for instruments only)
    if not database_was_modified(instrument_id):
        return None

    # Get Google Drive instance
    drive = get_drive_instance()

    # Get Google Drive ID's for the MS-AutoQC folder and database file
    gdrive_folder_id = get_drive_folder_id()
    instrument_db_file_id = get_instrument(instrument_id)["drive_id"].values[0]

    # Without both Drive ID's there is nothing to download
    if gdrive_folder_id is None or instrument_db_file_id is None:
        return None

    # Download newly added / modified MSP files in MS-AutoQC > methods
    if sync_settings == True:
        download_methods(skip_check=True)

    try:
        folder_query = {"q": "'" + gdrive_folder_id + "' in parents and trashed=false"}

        for file in drive.ListFile(folder_query).GetList():
            if file["title"] == db_zip_file:

                # Download straight into the data directory by absolute path; changing the
                # process-wide working directory would leave it wrong if the download raised
                file.GetContentFile(os.path.join(data_directory, file["title"]))
                unzip_database(instrument_id=instrument_id)

                # Save modifiedDate of database file
                remember_last_modified(database=instrument_id, modified_date=file["modifiedDate"])

    except Exception as error:
        print("Error downloading database from Google Drive:", error)
        return None

    return time.strftime("%H:%M:%S")
4191
4192
def upload_methods():

    """
    Zips the methods directory and uploads the archive to Google Drive as "methods.zip".

    Nothing is uploaded if no Drive ID is stored for the methods ZIP archive.

    Returns:
        None
    """

    # Drive ID of the methods ZIP archive, stored in the workspace record
    workspace = get_table("Settings", "workspace")
    methods_zip_file_id = workspace["methods_zip_file_id"].values[0]

    # Vacuum the Settings database so the upload is as small as possible
    execute_vacuum("Settings")

    # Google Drive instance
    drive = get_drive_instance()

    # Nothing to update without a Drive ID for the archive
    if methods_zip_file_id is None:
        return None

    # Zip the methods directory and replace the existing file in Google Drive
    archive_path = zip_methods()
    drive_file = drive.CreateFile({"id": methods_zip_file_id, "title": "methods.zip"})
    drive_file.SetContentFile(archive_path)
    drive_file.Upload()

    # Record modifiedDate of the uploaded methods ZIP file
    remember_last_modified(database="Settings", modified_date=drive_file["modifiedDate"])
4222
4223
def download_methods(skip_check=False):

    """
    Downloads methods directory ZIP archive from Google Drive.

    The device identity and MS-DIAL directory are read before the download and written back
    afterwards (presumably because the downloaded archive replaces the local Settings
    database -- confirm against unzip_methods).

    Args:
        skip_check (bool, default False): If True, skips checking whether database was modified

    Returns:
        str: Timestamp (HH:MM:SS) upon download completion, or None if the methods were not
        modified or the download failed.
    """

    # If the database was not modified by another instrument, skip download (for instruments only)
    if not skip_check:
        if not database_was_modified("Settings"):
            return None

    # Get device identity
    instrument_bool = is_instrument_computer()
    device_identity = get_device_identity()

    # Get MS-DIAL directory (best effort; None if it cannot be read)
    try:
        msdial_directory = get_msdial_directory()
    except Exception:
        msdial_directory = None

    # Get Google Drive instance
    drive = get_drive_instance()

    # Get Google Drive folder ID
    gdrive_folder_id = get_drive_folder_id()

    try:
        folder_query = {"q": "'" + gdrive_folder_id + "' in parents and trashed=false"}

        # Download and unzip methods directory
        for file in drive.ListFile(folder_query).GetList():
            if file["title"] == "methods.zip":

                # Download straight into the data directory by absolute path; changing the
                # process-wide working directory would leave it wrong if the download raised
                file.GetContentFile(os.path.join(data_directory, file["title"]))
                unzip_methods()

                # Save modifiedDate of methods directory
                remember_last_modified(database="Settings", modified_date=file["modifiedDate"])

    except Exception as error:
        print("Error downloading methods from Google Drive:", error)
        return None

    # Update MS-DIAL directory
    update_msdial_directory(msdial_directory)

    # Update user device identity
    set_device_identity(is_instrument_computer=instrument_bool, instrument_id=device_identity)
    return time.strftime("%H:%M:%S")
4279
4280
def remember_last_modified(database, modified_date):

    """
    Stores the Google Drive modified date of a synced file in the local Settings database.

    For the "Settings" database, the value is written to the "methods_last_modified" column
    of the workspace row with id 1. For any other name, it is written to the "last_modified"
    column of the matching row in the "instruments" table.

    This function is called after file upload, and used for comparison before download.

    Args:
        database (str):
            Name of database (either Instrument ID or "Settings")
        modified_date (str):
            Modified date of file uploaded to Google Drive

    Returns:
        None
    """

    db_metadata, connection = connect_to_database("Settings")

    # Close the connection even if reflection or the update fails
    try:
        if database == "Settings":
            workspace_table = sa.Table("workspace", db_metadata, autoload=True)
            statement = (
                sa.update(workspace_table)
                    .where(workspace_table.c.id == 1)
                    .values(methods_last_modified=modified_date)
            )
        else:
            instruments_table = sa.Table("instruments", db_metadata, autoload=True)
            statement = (
                sa.update(instruments_table)
                    .where(instruments_table.c.name == database)
                    .values(last_modified=modified_date)
            )

        connection.execute(statement)

    finally:
        connection.close()
4316
4317
def database_was_modified(database_name):

    """
    Compares the locally stored "last modified" value of a synced file against Google Drive.

    Args:
        database_name (str): Name of database ("Settings" or an instrument ID)

    Returns:
        bool: False if the locally stored value equals the modifiedDate of the matching file
        in the Google Drive folder, True otherwise (including when no matching file is found).
    """

    # Google Drive folder ID is read from the database
    gdrive_folder_id = get_drive_folder_id()

    # Look up the locally stored value and the name of the file in Google Drive
    if database_name != "Settings":
        local_last_modified = get_instrument(database_name)["last_modified"].values[0]
        filename = database_name.replace(" ", "_") + ".zip"
    else:
        local_last_modified = get_table("Settings", "workspace")["methods_last_modified"].values[0]
        filename = "methods.zip"

    # List the folder contents and take the modifiedDate of the first matching file
    drive = get_drive_instance()
    folder_listing = drive.ListFile({"q": "'" + gdrive_folder_id + "' in parents and trashed=false"}).GetList()
    drive_last_modified = next(
        (item["modifiedDate"] for item in folder_listing if item["title"] == filename), None)

    return not (local_last_modified == drive_last_modified)
4354
4355
def send_sync_signal(folder_id):

    """
    Uploads an empty file titled "Syncing" to the given Google Drive folder.

    TODO: This method is deprecated. Please remove if no plans for usage.

    Args:
        folder_id (str): Google Drive folder ID

    Returns:
        bool: True if sync signal was sent, False if the upload raised an error.
    """

    drive = get_drive_instance()
    signal_metadata = {"title": "Syncing", "parents": [{"id": folder_id}]}

    try:
        drive.CreateFile(metadata=signal_metadata).Upload()
    except:
        return False
    else:
        return True
4378
4379
def safe_to_upload(folder_id):

    """
    Checks the given Google Drive folder for a file titled "Syncing".

    TODO: This method is deprecated. Please remove if no plans for usage.

    Args:
        folder_id (str): Google Drive folder ID

    Returns:
        bool: False if a "Syncing" file is present in the folder, True if not.
    """

    drive = get_drive_instance()
    folder_listing = drive.ListFile({"q": "'" + folder_id + "' in parents and trashed=false"}).GetList()

    return not any(item["title"] == "Syncing" for item in folder_listing)
4402
4403
def remove_sync_signal(folder_id):

    """
    Deletes every file titled "Syncing" from the given Google Drive folder.

    TODO: This method is deprecated. Please remove if no plans for usage.

    Args:
        folder_id (str): Google Drive folder ID

    Returns:
        bool: True if the folder was processed without error, False if not.
    """

    drive = get_drive_instance()

    try:
        folder_listing = drive.ListFile({"q": "'" + folder_id + "' in parents and trashed=false"}).GetList()
        for item in folder_listing:
            if item["title"] != "Syncing":
                continue
            item.Delete()
    except:
        return False
    else:
        return True
4428
4429
def delete_active_run_csv_files(instrument_id, run_id):

    """
    Checks for and deletes the ZIP archive of CSV files from Google Drive at the end of an
    active instrument run, then clears the run's "drive_id" in the instrument database.

    The "drive_id" is cleared whether or not an archive was found in Google Drive.

    Args:
        instrument_id (str):
            Instrument ID
        run_id (str):
            Instrument run ID (job ID)

    Returns:
        None
    """

    archive_name = instrument_id.replace(" ", "_") + "_" + run_id + ".zip"

    # Find zip archive of CSV files in Google Drive and delete it
    drive = get_drive_instance()
    gdrive_folder_id = get_drive_folder_id()

    if gdrive_folder_id is not None:
        drive_file_list = drive.ListFile({"q": "'" + gdrive_folder_id + "' in parents and trashed=false"}).GetList()
        for file in drive_file_list:
            if file["title"] == archive_name:
                file.Delete()
                break

    # Delete Drive ID from database; close the connection even if the update fails
    db_metadata, connection = connect_to_database(instrument_id)

    try:
        runs_table = sa.Table("runs", db_metadata, autoload=True)
        connection.execute((
            sa.update(runs_table)
                .where(runs_table.c.run_id == run_id)
                .values(drive_id=None)
        ))
    finally:
        connection.close()
4469
4470
def sync_on_run_completion(instrument_id, run_id):

    """
    Syncs database with Google Drive at the end of an active instrument run.

    Performs the following actions:
        1. Upload database to Google Drive
        2. Delete active run CSV files

    Errors in either step are printed rather than raised. If the upload fails,
    the CSV files are not deleted.

    Args:
        instrument_id (str):
            Instrument ID
        run_id (str):
            Instrument run ID (job ID)

    Returns:
        None
    """

    # Upload database to Google Drive
    try:
        upload_database(instrument_id)
    except Exception as error:
        print("sync_on_run_completion() – Error uploading database during sync", error)
        return None

    # Delete active run CSV files
    try:
        delete_active_run_csv_files(instrument_id, run_id)
    except Exception as error:
        print("sync_on_run_completion() – Error deleting CSV files after sync", error)

    return None
4507
4508
def get_data_file_type(instrument_id):

    """
    Returns expected data file extension based on instrument vendor type.

    TODO: Modify this function as needed when adding support for other instrument vendors.

    Args:
        instrument_id (str): Instrument ID

    Returns:
        str or None: Data file extension for the instrument vendor (without a leading dot),
        or None if the vendor is not one of the supported vendors.

    Raises:
        IndexError: If no instrument with the given ID exists in the settings database.
    """

    # Vendor name (as stored in the "instruments" table) -> data file extension
    vendor_extensions = {
        "Thermo Fisher": "raw",
        "Agilent": "d",
        "Bruker": "baf",
        "Waters": "raw",
        "Sciex": "wiff2"
    }

    # Bind the instrument ID as a parameter rather than concatenating it into the SQL string,
    # so that names containing quotes cannot break or alter the query
    engine = sa.create_engine(settings_database)
    query = sa.text("SELECT * FROM instruments WHERE name = :name")
    df_instruments = pd.read_sql(query, engine, params={"name": instrument_id})
    vendor = df_instruments["vendor"].astype(str).tolist()[0]

    return vendor_extensions.get(vendor)
4537
4538
def is_completed_run(instrument_id, run_id):

    """
    Reports whether the given QC job was registered with the job type "completed".

    Args:
        instrument_id (str):
            Instrument ID
        run_id (str):
            Instrument run ID (job ID)

    Returns:
        bool: True if the job type of the run is "completed". False for any other job type,
        or if the job type could not be retrieved (the traceback is printed in that case).
    """

    try:
        df_run = get_instrument_run(instrument_id, run_id)
        job_type = df_run["job_type"].astype(str).values[0]
    except:
        print("Could not get MS-AutoQC job type.")
        traceback.print_exc()
        return False

    return bool(job_type == "completed")
4564
4565
def delete_temp_directory(instrument_id, run_id):

    """
    Removes the temporary data file directory of a run from the local data directory.

    The directory is named after the instrument ID (spaces replaced by underscores) and
    the run ID, joined by an underscore. Any error is reported with a printed message
    instead of being raised.

    This function is called at the end of an instrument run (QC job).

    Args:
        instrument_id (str):
            Instrument ID
        run_id (str):
            Instrument run ID (job ID)

    Returns:
        None
    """

    try:
        folder_name = "_".join([instrument_id.replace(" ", "_"), run_id])
        temp_directory = os.path.join(data_directory, folder_name)

        # Nothing to do if the directory was never created
        if not os.path.exists(temp_directory):
            return

        shutil.rmtree(temp_directory)
    except:
        print("Could not delete temporary data directory.")
4591
4592
def pipeline_valid(module=None):

    """
    Checks that the MSConvert and MS-DIAL executables exist in their configured directories.

    Both locations are always checked; a location that cannot be resolved counts as
    "not installed".

    This function is called during job setup validation.

    Args:
        module (str, default None):
            If "msdial" or "msconvert", only the result for that module is returned.

    Returns:
        bool: Whether msconvert.exe and MsdialConsoleApp.exe exist (or only the requested one).
    """

    # MSConvert
    try:
        msconvert_path = os.path.join(get_msconvert_directory(), "msconvert.exe")
        msconvert_installed = os.path.exists(msconvert_path)
    except:
        msconvert_installed = False

    # MS-DIAL
    try:
        msdial_path = os.path.join(get_msdial_directory(), "MsdialConsoleApp.exe")
        msdial_installed = os.path.exists(msdial_path)
    except:
        msdial_installed = False

    if module not in ("msdial", "msconvert"):
        return msconvert_installed and msdial_installed

    return msdial_installed if module == "msdial" else msconvert_installed
4623
4624
def send_email(subject, message_body):

    """
    Sends an email through the Gmail API using the stored Google credentials.

    The recipients are taken from the registered email notifications list. According to
    the original documentation, this function is used for QC warnings and QC fails when:
        1. Google Drive sync is enabled
        2. Email addresses are registered for notifications

    Args:
        subject (str):
            Subject of email
        message_body (str):
            Body of email

    Returns:
        On success, the response returned by the Gmail API "send" request.
        None if any error occurred (the traceback is printed).
    """

    try:
        # Authenticate against the Gmail API
        credentials = google_auth.load_credentials_from_file(alt_credentials)[0]
        service = build("gmail", "v1", credentials=credentials)

        # Compose the message
        message = EmailMessage()
        message.set_content(message_body)
        message["Subject"] = subject
        message["To"] = get_email_notifications_list(as_string=True)

        # The Gmail API expects the message as a URL-safe base64 string
        payload = {"raw": base64.urlsafe_b64encode(message.as_bytes()).decode()}
        return service.users().messages().send(userId="me", body=payload).execute()

    except Exception:
        traceback.print_exc()
        return None
auth_container = [<pydrive2.auth.GoogleAuth object>]

The functions defined below operate on two database types:

  • One storing instrument run metadata, sample QC results, and biological standard QC results
  • The other storing instrument metadata, workspace settings for workspace access, chromatography methods, biological standards, QC configurations, and MS-DIAL configurations

In addition, this file also contains methods for syncing data and settings with Google Drive. For an overview of all functions, please see the documentation at https://czbiohub.github.io/MS-AutoQC.

def get_database_file(instrument_id, sqlite_conn=False, zip=False):
def get_database_file(instrument_id, sqlite_conn=False, zip=False):

    """
    Builds the path of the database file (or ZIP archive) for a given instrument ID.

    Spaces in the instrument ID are replaced by underscores to form the filename.

    Args:
        instrument_id (str):
            Instrument ID that specifies which database file to retrieve
        sqlite_conn (bool, default False):
            Whether to return a "sqlite:///" connection string instead of a filesystem path
        zip (bool, default False):
            Whether to use the ".zip" extension instead of ".db"

    Returns:
        str: Connection string or path for the database file
    """

    extension = ".zip" if zip else ".db"
    filename = instrument_id.replace(" ", "_") + extension

    if not sqlite_conn:
        return os.path.join(data_directory, filename)

    return "sqlite:///data/" + filename

Returns database file for a given instrument ID.

Arguments:
  • instrument_id (str): Instrument ID that specifies which database file to retrieve
  • sqlite_conn (bool, default False): Whether to receive the path for establishing a SQLite connection
  • zip (bool, default False): Whether to return the path of the instrument's ZIP archive (".zip" extension) instead of the database file (".db" extension)
Returns:

str: Path for the database file

def connect_to_database(name):
 79def connect_to_database(name):
 80
 81    """
 82    Establishes a connection to a SQLite database of choice
 83
 84    Args:
 85        name (str):
 86            Name of the database, either "Settings" or an instrument ID
 87
 88    Returns:
 89        sqlalchemy.MetaData:
 90            A container object that consists of different features of a database being described
 91        sqlalchemy.Connection:
 92            An object that represents a single DBAPI connection, and always emits SQL statements within
 93            the context of a transaction block
 94    """
 95
 96    if name == "Settings":
 97        database_file = settings_database
 98    else:
 99        database_file = get_database_file(instrument_id=name, sqlite_conn=True)
100
101    engine = sa.create_engine(database_file)
102    db_metadata = sa.MetaData(bind=engine)
103    connection = engine.connect()
104
105    return db_metadata, connection

Establishes a connection to a SQLite database of choice

Arguments:
  • name (str): Name of the database, either "Settings" or an instrument ID
Returns:

sqlalchemy.MetaData: A container object that consists of different features of a database being described. sqlalchemy.Connection: An object that represents a single DBAPI connection, and always emits SQL statements within the context of a transaction block.

def create_databases(instrument_id, new_instrument=False):
def create_databases(instrument_id, new_instrument=False):

    """
    Initializes SQLite databases for 1) instrument data and 2) workspace settings.

    Creates the following tables in the instrument database: "runs", "bio_qc_results", "sample_qc_results".

    Creates the following tables in the settings database: "biological_standards", "chromatography_methods",
    "email_notifications", "instruments", "gdrive_users", "internal_standards", "msdial_parameters", "qc_parameters",
    "targeted_features", "workspace".

    Every table has an INTEGER primary key column named "id" in addition to the columns listed below.

    Args:
        instrument_id (str):
            Instrument ID to name the new database ("Thermo QE 1" becomes "Thermo_QE_1.db")
        new_instrument (bool, default False):
            Whether a new instrument database is being added to a workspace, or whether a new
            instrument database AND settings database are being created for the first time

    Returns:
        None
    """

    def define_tables(metadata, schema):
        # Registers one table per (name, columns) entry on the given MetaData object
        for table_name, columns in schema:
            sa.Table(
                table_name, metadata,
                sa.Column("id", INTEGER, primary_key=True),
                *[sa.Column(column_name, column_type) for column_name, column_type in columns]
            )

    # Schema of the instrument database
    instrument_schema = [
        ("bio_qc_results", [
            ("sample_id", TEXT),
            ("run_id", TEXT),
            ("polarity", TEXT),
            ("precursor_mz", TEXT),
            ("retention_time", TEXT),
            ("intensity", TEXT),
            ("md5", TEXT),
            ("qc_dataframe", TEXT),
            ("qc_result", TEXT),
            ("biological_standard", TEXT),
            ("position", TEXT),
        ]),
        ("runs", [
            ("run_id", TEXT),
            ("chromatography", TEXT),
            ("acquisition_path", TEXT),
            ("sequence", TEXT),
            ("metadata", TEXT),
            ("status", TEXT),
            ("samples", INTEGER),
            ("completed", INTEGER),
            ("passes", INTEGER),
            ("fails", INTEGER),
            ("latest_sample", TEXT),
            ("qc_config_id", TEXT),
            ("biological_standards", TEXT),
            ("pid", INTEGER),
            ("drive_id", TEXT),
            ("sample_status", TEXT),
            ("job_type", TEXT),
        ]),
        ("sample_qc_results", [
            ("sample_id", TEXT),
            ("run_id", TEXT),
            ("polarity", TEXT),
            ("position", TEXT),
            ("md5", TEXT),
            ("precursor_mz", TEXT),
            ("retention_time", TEXT),
            ("intensity", TEXT),
            ("qc_dataframe", TEXT),
            ("qc_result", TEXT),
        ]),
    ]

    # Create tables for instrument database
    qc_db_engine = sa.create_engine(get_database_file(instrument_id=instrument_id, sqlite_conn=True))
    qc_db_metadata = sa.MetaData()
    define_tables(qc_db_metadata, instrument_schema)
    qc_db_metadata.create_all(qc_db_engine)

    # If only creating instrument database, save and return here
    if new_instrument:
        set_device_identity(is_instrument_computer=True, instrument_id=instrument_id)
        return None

    # Schema of Settings.db
    settings_schema = [
        ("instruments", [
            ("name", TEXT),
            ("vendor", TEXT),
            ("drive_id", TEXT),
            ("last_modified", TEXT),
        ]),
        ("biological_standards", [
            ("name", TEXT),
            ("identifier", TEXT),
            ("chromatography", TEXT),
            ("num_pos_features", INTEGER),
            ("num_neg_features", INTEGER),
            ("pos_bio_msp_file", TEXT),
            ("neg_bio_msp_file", TEXT),
            ("pos_parameter_file", TEXT),
            ("neg_parameter_file", TEXT),
            ("msdial_config_id", TEXT),
        ]),
        ("chromatography_methods", [
            ("method_id", TEXT),
            ("num_pos_standards", INTEGER),
            ("num_neg_standards", INTEGER),
            ("pos_istd_msp_file", TEXT),
            ("neg_istd_msp_file", TEXT),
            ("pos_parameter_file", TEXT),
            ("neg_parameter_file", TEXT),
            ("msdial_config_id", TEXT),
        ]),
        ("gdrive_users", [
            ("name", TEXT),
            ("email_address", TEXT),
            ("permission_id", TEXT),
        ]),
        ("internal_standards", [
            ("name", TEXT),
            ("chromatography", TEXT),
            ("polarity", TEXT),
            ("precursor_mz", REAL),
            ("retention_time", REAL),
            ("ms2_spectrum", TEXT),
            ("inchikey", TEXT),
        ]),
        ("msdial_parameters", [
            ("config_name", TEXT),
            ("rt_begin", INTEGER),
            ("rt_end", INTEGER),
            ("mz_begin", INTEGER),
            ("mz_end", INTEGER),
            ("ms1_centroid_tolerance", REAL),
            ("ms2_centroid_tolerance", REAL),
            ("smoothing_method", TEXT),
            ("smoothing_level", INTEGER),
            ("min_peak_width", INTEGER),
            ("min_peak_height", INTEGER),
            ("mass_slice_width", REAL),
            ("post_id_rt_tolerance", REAL),
            ("post_id_mz_tolerance", REAL),
            ("post_id_score_cutoff", REAL),
            ("alignment_rt_tolerance", REAL),
            ("alignment_mz_tolerance", REAL),
            ("alignment_rt_factor", REAL),
            ("alignment_mz_factor", REAL),
            ("peak_count_filter", INTEGER),
            ("qc_at_least_filter", TEXT),
        ]),
        ("email_notifications", [
            ("email_address", TEXT),
        ]),
        ("qc_parameters", [
            ("config_name", TEXT),
            ("intensity_dropouts_cutoff", INTEGER),
            ("library_rt_shift_cutoff", REAL),
            ("in_run_rt_shift_cutoff", REAL),
            ("library_mz_shift_cutoff", REAL),
            ("intensity_enabled", INTEGER),
            ("library_rt_enabled", INTEGER),
            ("in_run_rt_enabled", INTEGER),
            ("library_mz_enabled", INTEGER),
        ]),
        ("targeted_features", [
            ("name", TEXT),
            ("chromatography", TEXT),
            ("polarity", TEXT),
            ("biological_standard", TEXT),
            ("precursor_mz", REAL),
            ("retention_time", REAL),
            ("ms2_spectrum", TEXT),
            ("inchikey", TEXT),
        ]),
        ("workspace", [
            ("slack_bot_token", TEXT),
            ("slack_channel", TEXT),
            ("slack_enabled", INTEGER),
            ("gdrive_folder_id", TEXT),
            ("methods_zip_file_id", TEXT),
            ("methods_last_modified", TEXT),
            ("msdial_directory", TEXT),
            ("is_instrument_computer", INTEGER),
            ("instrument_identity", TEXT),
        ]),
    ]

    # Create tables for Settings.db
    settings_db_engine = sa.create_engine(settings_database)
    settings_db_metadata = sa.MetaData()
    define_tables(settings_db_metadata, settings_schema)
    settings_db_metadata.create_all(settings_db_engine)

    # Insert default configurations for MS-DIAL and MS-AutoQC
    add_msdial_configuration("Default")
    add_qc_configuration("Default")

    # Initialize workspace metadata
    create_workspace_metadata()

    # Save device identity based on setup values
    set_device_identity(is_instrument_computer=True, instrument_id=instrument_id)
    return None

Initializes SQLite databases for 1) instrument data and 2) workspace settings.

Creates the following tables in the instrument database: "runs", "bio_qc_results", "sample_qc_results".

Creates the following tables in the settings database: "biological_standards", "chromatography_methods", "email_notifications", "instruments", "gdrive_users", "internal_standards", "msdial_parameters", "qc_parameters", "targeted_features", "workspace".

Arguments:
  • instrument_id (str): Instrument ID to name the new database ("Thermo QE 1" becomes "Thermo_QE_1.db")
  • new_instrument (bool, default False): Whether a new instrument database is being added to a workspace, or whether a new instrument database AND settings database are being created for the first time
Returns:

None

def execute_vacuum(database):
def execute_vacuum(database):

    """
    Executes VACUUM command on the database of choice.

    Args:
        database (str): name of the database, either "Settings" or Instrument ID

    Returns:
        None
    """

    _, connection = connect_to_database(database)

    # Close the connection even if the VACUUM command fails
    try:
        connection.execute("VACUUM")
    finally:
        connection.close()

Executes VACUUM command on the database of choice.

Arguments:
  • database (str): name of the database, either "Settings" or Instrument ID
Returns:

None

def get_drive_instance():
def get_drive_instance():

    """
    Creates a Google Drive instance from the current authentication object.

    Returns:
        GoogleDrive: Instance built from the GoogleAuth object stored in auth_container.
    """

    gauth = auth_container[0]
    return GoogleDrive(gauth)

Returns user-authenticated Google Drive instance.

def launch_google_drive_authentication():
def launch_google_drive_authentication():

    """
    Replaces the stored authentication instance with a fresh one and starts the
    local webserver authentication flow on it.
    """

    gauth = GoogleAuth(settings_file=drive_settings_file)
    auth_container[0] = gauth
    gauth.LocalWebserverAuth()

Launches Google Drive authentication flow and sets authentication instance.

def save_google_drive_credentials():
def save_google_drive_credentials():

    """
    Writes the credentials of the current authentication instance to the credentials.txt file.
    """

    gauth = auth_container[0]
    gauth.SaveCredentialsFile(credentials_file)

Saves Google credentials to a credentials.txt file.

def initialize_google_drive():
def initialize_google_drive():

    """
    Initializes instance of Google Drive using credentials.txt and settings.yaml in /auth directory

    If no credentials file exists yet, the user is asked to authenticate. Newly obtained
    credentials are only saved when the databases pass validation (see is_valid()).
    A copy of the credentials file marked as type "authorized_user" is written for sending
    emails, if that copy does not exist yet.

    Args:
        None

    Returns:
        bool: Whether the Google client credentials file (in the "auth" directory) exists.
    """

    # Create Google Drive instance
    auth_container[0] = GoogleAuth(settings_file=drive_settings_file)
    gauth = auth_container[0]

    # If no credentials file, make user authenticate
    if not os.path.exists(credentials_file) and is_valid():
        gauth.LocalWebserverAuth()

    # Try to load saved client credentials
    gauth.LoadCredentialsFile(credentials_file)

    if gauth.credentials is None:
        # If no saved credentials, make user authenticate again
        gauth.LocalWebserverAuth()
    elif gauth.access_token_expired:
        # Refresh credentials if expired
        gauth.Refresh()
    else:
        # Otherwise, authorize saved credentials
        gauth.Authorize()

    if not os.path.exists(credentials_file) and is_valid():
        save_google_drive_credentials()

    credentials_exist = os.path.exists(credentials_file)

    # Makes small modification for emails (for usage with Google's google.auth).
    # Skipped when there is no credentials file to copy, so that the function
    # returns False instead of raising FileNotFoundError.
    if credentials_exist and not os.path.exists(alt_credentials):
        with open(credentials_file, "r") as file:
            data = json.load(file)
        data["type"] = "authorized_user"
        with open(alt_credentials, "w") as file:
            json.dump(data, file)

    return credentials_exist

Initializes instance of Google Drive using credentials.txt and settings.yaml in /auth directory

Arguments:
  • None
Returns:

bool: Whether the Google client credentials file (in the "auth" directory) exists.

def is_valid(instrument_id=None):
def is_valid(instrument_id=None):

    """
    Checks that all required tables in all databases (or a single database of choice) are present.

    The settings database is always validated. Instrument databases are found by listing
    the ".db" files in the data directory, unless a specific instrument ID is given.

    Args:
        instrument_id (str, default None):
            Specified if validating a specific database

    Returns:
        bool: True if every validated database contains its required tables. False if a
        required table is missing or if a database could not be inspected.
    """

    settings_db_required_tables = {"biological_standards", "chromatography_methods", "email_notifications",
        "instruments", "gdrive_users", "internal_standards", "msdial_parameters", "qc_parameters",
        "targeted_features", "workspace"}

    instrument_db_required_tables = {"bio_qc_results", "runs", "sample_qc_results"}

    # Validate settings database (by table name, not just by number of tables)
    try:
        settings_db_tables = sa.create_engine(settings_database).table_names()
        if not settings_db_required_tables.issubset(settings_db_tables):
            return False
    except Exception:
        return False

    # Validate instrument databases
    try:
        if instrument_id is not None:
            # If given an instrument ID, only validate that instrument's database
            instrument_ids = [instrument_id]
        else:
            # Otherwise, validate all instrument databases. Only names ending in ".db" are
            # considered, so that side files such as "<name>.db-journal" are not mistaken
            # for databases (opening them would create empty database files).
            instrument_ids = [file[:-len(".db")] for file in os.listdir(data_directory)
                if file.endswith(".db") and "journal.db" not in file]

        for name in instrument_ids:
            database = get_database_file(name, sqlite_conn=True)
            instrument_db_tables = sa.create_engine(database).table_names()
            if not instrument_db_required_tables.issubset(instrument_db_tables):
                return False
    except Exception:
        return False

    return True

Checks that all required tables in all databases (or a single database of choice) are present.

Arguments:
  • instrument_id (str, default None): Specified if validating a specific database
Returns:

bool: True if the validated database(s) contain the required tables, False otherwise

def sync_is_enabled():
def sync_is_enabled():

    """
    Reports whether Google Drive sync is turned on for this workspace.

    Sync counts as enabled only when the databases pass is_valid() and the first row of the
    "workspace" table in the Settings database holds a usable value for both "gdrive_folder_id"
    and "methods_zip_file_id". A value of None, the string "None", or an empty string is
    treated as "not set".

    Typically used for separating sync-specific functionality.

    Returns:
        bool: Whether Google Drive sync is enabled or not
    """

    if not is_valid():
        return False

    workspace = get_table("Settings", "workspace")

    # Read both ID's up front so a missing column surfaces regardless of the other value
    stored_ids = (
        workspace["gdrive_folder_id"].values[0],
        workspace["methods_zip_file_id"].values[0],
    )

    for stored_id in stored_ids:
        if stored_id is None or stored_id == "None" or stored_id == "":
            return False

    return True

Checks whether Google Drive sync is enabled simply by querying whether Google Drive ID's exist in the database.

Typically used for separating sync-specific functionality.

Returns:

bool: Whether Google Drive sync is enabled or not

def email_notifications_are_enabled():
def email_notifications_are_enabled():

    """
    Reports whether email notifications can be sent.

    Three conditions must hold, checked in this order: the databases are valid, Google Drive sync
    is enabled, and the "email_notifications" table in the Settings database has at least one row
    (email addresses are registered by the user in Settings > General).

    Returns:
        bool: True if email notifications are enabled, False if not
    """

    # Short-circuits exactly like the two separate guards: validity first, then sync
    if not (is_valid() and sync_is_enabled()):
        return False

    registered_emails = get_table("Settings", "email_notifications")
    return len(registered_emails) > 0

Checks whether email notifications are enabled.

Returns True if databases are valid, Google Drive sync is enabled, and if email addresses were registered by user in Settings > General. Returns False if any condition is not met.

Returns:

bool: True if email notifications are enabled, False if not

def slack_notifications_are_enabled():
def slack_notifications_are_enabled():

    """
    Checks whether Slack notifications are enabled.

    Reads the "slack_enabled" value from the first row of the "workspace" table in the Settings
    database and interprets it as an integer flag. Any failure to read or convert that value
    (missing column, empty table, non-numeric value) is treated as "disabled".

    Returns True if user enabled Slack notifications in Settings > General, and False if not.

    Returns:
        bool: True if Slack notifications are enabled, False if not
    """

    if not is_valid():
        return False

    try:
        workspace = get_table("Settings", "workspace")
        return bool(workspace["slack_enabled"].astype(int).tolist()[0])
    # Best-effort lookup, but let KeyboardInterrupt / SystemExit propagate
    except Exception:
        return False

Checks whether Slack notifications are enabled.

Returns True if user enabled Slack notifications in Settings > General, and False if not.

Returns:

bool: True if Slack notifications are enabled, False if not

def is_instrument_computer():
def is_instrument_computer():

    """
    Reports whether this device is registered as the instrument computer.

    The answer comes from the "is_instrument_computer" value in the first row of the "workspace"
    table in the Settings database, interpreted as an integer flag. Per the setup flow, the flag is
    set when the user creates a new instrument or signs in as an instrument device.

    Typically used to organize / hide UI functions for instrument and non-instrument devices
    that MS-AutoQC is installed on.

    Returns:
        True if device is instrument computer, False if not
    """

    workspace = get_table("Settings", "workspace")
    flags = workspace["is_instrument_computer"].astype(int).tolist()
    return bool(flags[0])

Checks whether user's device is the instrument computer.

This is specified during setup. If the user created a new instrument, or signed in as an instrument device, then this will return True. If the user signed in to their workspace from a non-instrument device, this will return False.

Typically used to organize / hide UI functions for instrument and non-instrument devices that MS-AutoQC is installed on.

Returns:

True if device is instrument computer, False if not

def get_md5_for_settings_db():
def get_md5_for_settings_db():

    """
    Computes the MD5 checksum of the settings database file.

    The file is read in 4096-byte blocks so the whole database never has to sit in memory.

    Typically used for checking whether the user changed settings and prompting a Google Drive sync (if sync is enabled).

    Returns:
        An MD5 checksum (hex string) of /data/methods/Settings.db
    """

    digest = hashlib.md5()

    with open(settings_db_file, "rb") as db_file:
        while True:
            block = db_file.read(4096)
            # An empty read marks end of file
            if block == b"":
                break
            digest.update(block)

    return digest.hexdigest()

Calculates and returns MD5 checksum for the settings database file.

Typically used for checking whether the user changed settings and prompting a Google Drive sync (if sync is enabled).

Returns:

An MD5 checksum of /data/methods/Settings.db

def settings_were_modified(md5_checksum):
def settings_were_modified(md5_checksum):

    """
    Tells whether the settings database file changed since a checksum was taken.

    The checksum passed in (computed when Settings were opened) is compared against a checksum
    computed now, at the time of this call (when Settings were closed).

    Args:
        md5_checksum (str):
            An MD5 checksum of /data/methods/Settings.db that was computed when the user opened Settings in the app

    Returns:
        bool: True if checksums don't match, False if checksums match.
    """

    current_checksum = get_md5_for_settings_db()
    return md5_checksum != current_checksum

Checks whether settings database file has been modified.

This is done by comparing the checksum computed when Settings were opened (given as a parameter) with the checksum computed when Settings were closed (in this function call).

Arguments:
  • md5_checksum (str): An MD5 checksum of /data/methods/Settings.db that was computed when the user opened Settings in the app
Returns:

bool: True if checksums don't match, False if checksums match.

def zip_database(instrument_id=None, filename=None):
def zip_database(instrument_id=None, filename=None):

    """
    Compresses an instrument database file into a ZIP archive in the /data directory.

    Used for fast downloads / uploads over network connections to Google Drive (if Google Drive sync is enabled).

    The database can be selected either by archive filename or by instrument ID; when both are
    given, the filename takes precedence. For example, zipping the database for "Thermo QE 1"
    will generate a zip file with path "../data/Thermo_QE_1.zip".

    Args:
        instrument_id (str, default None):
            If specified, selects a database to zip by instrument ID (ex: "Thermo QE 1")
        filename (str, default None):
            If specified, selects a database to zip by filename (ex: "Thermo_QE_1.zip")

    Returns:
        None
    """

    if filename is not None:
        archive_path = os.path.join(data_directory, filename)
        db_filename = filename.replace(".zip", ".db")
    elif instrument_id is not None:
        archive_path = get_database_file(instrument_id, zip=True)
        db_filename = instrument_id.replace(" ", "_") + ".db"
    else:
        # Nothing to select a database by
        return None

    # make_archive appends the ".zip" extension itself, so hand it the extension-less name
    archive_base = archive_path.replace(".zip", "")
    shutil.make_archive(archive_base, "zip", root_dir=data_directory, base_dir=db_filename)

Compresses instrument database file into a ZIP archive in /data directory.

Used for fast downloads / uploads over network connections to Google Drive (if Google Drive sync is enabled).

The zip archive is accessible by filename and path in the /data directory. For example, zipping the database for "Thermo QE 1" will generate a zip file with path "../data/Thermo_QE_1.zip".

Arguments:
  • instrument_id (str, default None): If specified, selects a database to zip by instrument ID (ex: "Thermo QE 1")
  • filename (str, default None): If specified, selects a database to zip by filename (ex: "Thermo_QE_1.zip")
Returns:

None

def unzip_database(instrument_id=None, filename=None):
def unzip_database(instrument_id=None, filename=None):

    """
    Extracts an instrument database ZIP archive into the /data directory, then deletes the archive.

    The archive can be selected either by instrument ID or by filename; when both are given,
    the instrument ID takes precedence.

    Args:
        instrument_id (str, default None):
            If specified, selects a database to unzip by instrument ID (ex: "Thermo QE 1")
        filename (str, default None):
            If specified, selects a database to unzip by filename (ex: "Thermo_QE_1.zip")

    Returns:
        None
    """

    if instrument_id is not None:
        archive_path = get_database_file(instrument_id, zip=True)
    elif filename is not None:
        archive_path = os.path.join(data_directory, filename)
    else:
        # Nothing to select an archive by
        return None

    shutil.unpack_archive(archive_path, extract_dir=data_directory, format="zip")
    os.remove(archive_path)

Unzips ZIP archive containing instrument database file and deletes the archive when complete.

Arguments:
  • instrument_id (str, default None): If specified, selects a database to unzip by instrument ID (ex: "Thermo QE 1")
  • filename (str, default None): If specified, selects a database to unzip by filename (ex: "Thermo_QE_1.zip")
Returns:

None

def zip_methods():
def zip_methods():

    """
    Packs the contents of the methods directory into a ZIP archive in the /data directory.

    Returns:
        Path for zip archive of methods directory (ex: "../data/methods.zip")
    """

    zip_path = os.path.join(data_directory, "methods.zip")

    # make_archive appends the ".zip" extension itself, so hand it the extension-less name
    archive_base = zip_path.replace(".zip", "")
    shutil.make_archive(archive_base, "zip", root_dir=methods_directory)

    return archive_base + ".zip"

Compresses methods directory into a ZIP archive in /data directory.

Returns:

Path for zip archive of methods directory (ex: "../data/methods.zip")

def unzip_methods():
def unzip_methods():

    """
    Extracts "methods.zip" from the /data directory into the methods directory,
    then deletes the archive.
    """

    methods_archive = os.path.join(data_directory, "methods.zip")
    shutil.unpack_archive(methods_archive, extract_dir=methods_directory, format="zip")
    os.remove(methods_archive)

Unzips ZIP archive containing methods directory and deletes the archive when complete.

def zip_csv_files(input_directory, output_directory_and_name):
def zip_csv_files(input_directory, output_directory_and_name):

    """
    Packs the contents of a directory of CSV files into a ZIP archive.

    Used for fast upload of instrument run data to Google Drive during an active instrument run (if Google Drive sync is enabled).

    Args:
        input_directory (str):
            The temporary directory for files pertaining to an instrument run, denoted as "Instrument_ID_Run_ID".
            For example, a job with ID "BRDE001" created under instrument with ID "Thermo QE 1" would have its files
            stored in "/data/Thermo_QE_1_BRDE001".
        output_directory_and_name (str):
            The file path for the ZIP archive, without the ".zip" extension (ex: "/data/Instrument_ID_Run_ID").

    Returns:
        Path for zip archive of CSV files with instrument run data (ex: "../data/Instrument_ID_Run_ID.zip")
    """

    shutil.make_archive(output_directory_and_name, "zip", root_dir=input_directory)

    archive_path = output_directory_and_name + ".zip"
    return archive_path

Compresses CSV files into a ZIP archive in /data directory.

Used for fast upload of instrument run data to Google Drive during an active instrument run (if Google Drive sync is enabled).

Arguments:
  • input_directory (str): The temporary directory for files pertaining to an instrument run, denoted as "Instrument_ID_Run_ID". For example, a job with ID "BRDE001" created under instrument with ID "Thermo QE 1" would have its files stored in "/data/Thermo_QE_1_BRDE001".
  • output_directory_and_name (str): Essentially, the file path for the ZIP archive (ex: "/data/Instrument_ID_Run_ID").
Returns:

Path for zip archive of CSV files with instrument run data (ex: "../data/Instrument_ID_Run_ID.zip")

def unzip_csv_files(input_zip, output_directory):
def unzip_csv_files(input_zip, output_directory):

    """
    Extracts a ZIP archive of CSV files into the given directory, then deletes the archive.

    Args:
        input_zip (str):
            Path of the ZIP archive to extract
        output_directory (str):
            Directory that the archive contents are extracted into

    Returns:
        None
    """

    shutil.unpack_archive(input_zip, extract_dir=output_directory, format="zip")
    os.remove(input_zip)

Unzips ZIP archive of CSV files and deletes the archive upon completion.

def get_table(database_name, table_name):
def get_table(database_name, table_name):

    """
    Loads an entire database table into a pandas DataFrame.

    The name "Settings" selects the settings database; any other name is treated as an
    instrument ID and resolved through get_database_file().

    TODO: Improve this function to accept column and record queries

    Args:
        database_name (str):
            The database to query, using instrument ID or "Settings"
        table_name (str):
            The table to retrieve

    Returns:
        DataFrame of table.
    """

    if database_name != "Settings":
        connection_string = get_database_file(database_name, sqlite_conn=True)
    else:
        connection_string = settings_database

    select_all = "SELECT * FROM " + table_name
    return pd.read_sql(select_all, sa.create_engine(connection_string))

Retrieves table from database as a pandas DataFrame object.

TODO: Improve this function to accept column and record queries

Arguments:
  • database_name (str): The database to query, using instrument ID or "Settings"
  • table_name (str): The table to retrieve
Returns:

DataFrame of table.

def generate_client_settings_yaml(client_id, client_secret):
def generate_client_settings_yaml(client_id, client_secret):

    """
    Writes the settings.yaml file used for Google authentication.

    The file is written to an "auth" directory under the current working directory, which is
    created if it does not exist yet. Sections of the file are separated by a blank line, and
    the file ends without a trailing newline.

    Client ID and client secret are generated and provided by the user in the Google Cloud Console.

    See: https://docs.iterative.ai/PyDrive2/oauth/#automatic-and-custom-authentication-with-settings-yaml

    Args:
        client_id (str):
            The Client ID of the MS-AutoQC application, generated and provided by the user
        client_secret (str):
            The Client Secret of the MS-AutoQC application, generated and provided by the user
    Returns:
        None
    """

    # Each inner list is one section of the YAML file
    sections = [
        [
            "client_config_backend: settings",
            "client_config:",
            "  client_id: " + client_id,
            "  client_secret: " + client_secret,
        ],
        [
            "save_credentials: True",
            "save_credentials_backend: file",
            "save_credentials_file: auth/credentials.txt",
        ],
        [
            "get_refresh_token: True",
        ],
        [
            "oauth_scope:",
            "  - https://www.googleapis.com/auth/drive",
            "  - https://www.googleapis.com/auth/gmail.send",
            "  - https://www.googleapis.com/auth/userinfo.email",
        ],
    ]

    # One line per entry, one blank line between sections
    yaml_text = "\n\n".join("\n".join(section) for section in sections)

    target_directory = os.path.join(os.getcwd(), "auth")
    if not os.path.exists(target_directory):
        os.makedirs(target_directory)

    with open(os.path.join(target_directory, "settings.yaml"), "w") as yaml_file:
        yaml_file.write(yaml_text)

Generates a settings.yaml file for Google authentication in the /auth directory.

Client ID and client secret are generated and provided by the user in the Google Cloud Console.

See: https://docs.iterative.ai/PyDrive2/oauth/#automatic-and-custom-authentication-with-settings-yaml

Arguments:
  • client_id (str): The Client ID of the MS-AutoQC application, generated and provided by the user
  • client_secret (str): The Client Secret of the MS-AutoQC application, generated and provided by the user
Returns:

None

def insert_google_drive_ids( instrument_id, gdrive_folder_id, instrument_db_file_id, methods_zip_file_id):
def insert_google_drive_ids(instrument_id, gdrive_folder_id, instrument_db_file_id, methods_zip_file_id):

    """
    Inserts Google Drive ID's into corresponding tables to enable Google Drive sync.

    This function is called when a user creates a new instrument in their workspace.

    The ID's for the following files / folders in Google Drive are stored in the Settings database:
    1. MS-AutoQC folder ("workspace" table, row with id 1)
    2. Instrument database zip file ("instruments" table, row whose name matches the instrument ID)
    3. Methods directory zip file ("workspace" table, row with id 1)

    The database connection is closed even if one of the updates fails.

    Args:
        instrument_id (str):
            Instrument ID
        gdrive_folder_id (str):
            Google Drive ID for the MS-AutoQC folder (found in the user's root directory in Drive)
        instrument_db_file_id (str):
            Google Drive ID for the instrument database ZIP file
        methods_zip_file_id (str):
            Google Drive ID for the methods directory ZIP file

    Returns:
        None
    """

    db_metadata, connection = connect_to_database("Settings")

    try:
        instruments_table = sa.Table("instruments", db_metadata, autoload=True)
        workspace_table = sa.Table("workspace", db_metadata, autoload=True)

        # Instruments database
        connection.execute((
            sa.update(instruments_table)
                .where((instruments_table.c.name == instrument_id))
                .values(drive_id=instrument_db_file_id)
        ))

        # MS-AutoQC folder and Methods folder
        connection.execute((
            sa.update(workspace_table)
                .where((workspace_table.c.id == 1))
                .values(gdrive_folder_id=gdrive_folder_id,
                        methods_zip_file_id=methods_zip_file_id)
        ))

    finally:
        # Release the connection regardless of whether the updates succeeded
        connection.close()

Inserts Google Drive ID's into corresponding tables to enable Google Drive sync.

This function is called when a user creates a new instrument in their workspace.

The ID's for the following files / folders in Google Drive are stored in the database:

  1. MS-AutoQC folder
  2. Instrument database zip file
  3. Methods directory zip file
Arguments:
  • instrument_id (str): Instrument ID
  • gdrive_folder_id (str): Google Drive ID for the MS-AutoQC folder (found in the user's root directory in Drive)
  • instrument_db_file_id (str): Google Drive ID for the instrument database ZIP file
  • methods_zip_file_id (str): Google Drive ID for the methods directory ZIP file
Returns:

None

def insert_new_instrument(name, vendor):
def insert_new_instrument(name, vendor):

    """
    Inserts a new instrument into the "instruments" table in the Settings database.

    The name is the instrument ID, and the vendor is one of 5 options: Thermo Fisher, Agilent, Bruker, Sciex, and Waters.
    The vendor value is stored as given; it is not validated here.

    The database connection is closed even if the insert fails.

    Args:
        name (str):
            Instrument ID
        vendor (str):
            Instrument vendor

    Returns:
        None
    """

    # Connect to database
    db_metadata, connection = connect_to_database("Settings")

    try:
        # Get "instruments" table
        instruments_table = sa.Table("instruments", db_metadata, autoload=True)

        # Prepare and execute insert of new instrument
        insert_instrument = instruments_table.insert().values(
            {"name": name,
             "vendor": vendor}
        )
        connection.execute(insert_instrument)

    finally:
        # Release the connection regardless of whether the insert succeeded
        connection.close()

Inserts a new instrument into the "instruments" table in the Settings database.

The name is the instrument ID, and the vendor is one of 5 options: Thermo Fisher, Agilent, Bruker, Sciex, and Waters.

Arguments:
  • name (str): Instrument ID
  • vendor (str): Instrument vendor
Returns:

None

def get_instruments_list():
def get_instruments_list():

    """
    Lists the names of all instruments stored in the "instruments" table of the Settings database.

    Returns:
        List of instrument names, as strings
    """

    instruments = pd.read_sql("SELECT * FROM instruments", sa.create_engine(settings_database))
    instrument_names = instruments["name"]
    return instrument_names.astype(str).tolist()

Returns list of instruments in database.

def get_instrument(instrument_id):
def get_instrument(instrument_id):

    """
    Returns record from "instruments" table as a DataFrame for a given instrument

    The instrument ID is passed to the database as a bound parameter rather than being spliced
    into the SQL text, so ID's containing quotes or other special characters are matched literally.

    Args:
        instrument_id (str): Instrument ID

    Returns:
        DataFrame containing the name, vendor, and drive_id for the given instrument
    """

    engine = sa.create_engine(settings_database)
    query = sa.text("SELECT * FROM instruments WHERE name = :name")
    return pd.read_sql(query, engine, params={"name": instrument_id})

Returns record from "instruments" table as a DataFrame for a given instrument

Arguments:
  • instrument_id (str): Instrument ID
Returns:

DataFrame containing the name, vendor, and drive_id for the given instrument

def get_filenames_from_sequence(sequence, vendor='Thermo Fisher'):
def get_filenames_from_sequence(sequence, vendor="Thermo Fisher"):

    """
    Filters preblanks, washes, and shutdown injections from sequence file, and simultaneously assigns
    polarity to each sample based on presence of "Pos" or "Neg" in Instrument Method column.

    Rows are dropped when "File Name" contains both "_BK_" and "_pre_" (preblanks), or contains
    "_wash_" or "shutdown". If an instrument method contains both "Pos" and "Neg", the sample is
    assigned "Neg" because that assignment runs last.

    This function is called upon starting a new QC job.

    TODO: Adapt this function for other instrument vendors.
    TODO: Check the method filename, not entire file path, for "Pos" and "Neg".
        A folder containing "Pos" or "Neg" will give incorrect polarity assignments.

    Args:
        sequence (str):
            The acquisition sequence file, encoded as a JSON string in "split" format
        vendor (str):
            The instrument vendor (currently unused; see to-do statements)

    Returns:
        DataFrame of acquisition sequence, with preblanks / washes / shutdowns filtered out and polarities assigned
    """

    # Newer pandas versions deprecate passing a literal JSON string to read_json, so wrap it
    # in a buffer. Anything that is not a JSON literal is handed over unchanged.
    if isinstance(sequence, str) and sequence.lstrip().startswith(("{", "[")):
        json_source = io.StringIO(sequence)
    else:
        json_source = sequence

    df_sequence = pd.read_json(json_source, orient="split")

    file_names = df_sequence["File Name"]

    # Preblanks carry both markers; washes and shutdowns carry one each
    is_preblank = (file_names.str.contains(r"_BK_", na=False) &
                   file_names.str.contains(r"_pre_", na=False))
    is_wash = file_names.str.contains(r"_wash_", na=False)
    is_shutdown = file_names.str.contains(r"shutdown", na=False)

    # Copy the filtered rows so the polarity assignment below writes to an independent
    # DataFrame instead of a slice of the original
    df_sequence = df_sequence.loc[~(is_preblank | is_wash | is_shutdown)].copy()

    # Derive polarity from instrument method filename
    methods = df_sequence["Instrument Method"]
    df_sequence.loc[methods.str.contains(r"Pos", na=False), "Polarity"] = "Pos"
    df_sequence.loc[methods.str.contains(r"Neg", na=False), "Polarity"] = "Neg"

    return df_sequence

Filters preblanks, washes, and shutdown injections from sequence file, and simultaneously assigns polarity to each sample based on presence of "Pos" or "Neg" in Instrument Method column.

This function is called upon starting a new QC job.

TODO: Adapt this function for other instrument vendors. TODO: Check the method filename, not entire file path, for "Pos" and "Neg". A folder containing "Pos" or "Neg" will give incorrect polarity assignments.

Arguments:
  • sequence (str): The acquisition sequence file, encoded as a JSON string in "split" format
  • vendor (str): The instrument vendor (see to-do statements)
Returns:

DataFrame of acquisition sequence, with preblanks / washes / shutdowns filtered out and polarities assigned

def get_polarity_for_sample(instrument_id, run_id, sample_id, status):
 972def get_polarity_for_sample(instrument_id, run_id, sample_id, status):
 973
 974    """
 975    Returns polarity for a given sample.
 976
 977    TODO: Loading hundreds of rows of data before querying for one sample is massively inefficient.
 978        This function was written in haste and can be easily implemented in a much better way.
 979
 980    Args:
 981        instrument_id (str): Instrument ID
 982        run_id (str): Instrument run ID (job ID)
 983        sample_id (str): Sample ID
 984        status (str): Job status
 985
 986    Returns:
 987        Polarity for the given sample, as either "Pos" or "Neg".
 988    """
 989
 990    if get_device_identity() != instrument_id and sync_is_enabled():
 991        if status == "Complete":
 992            df = get_samples_in_run(instrument_id, run_id, "Both")
 993        elif status == "Active":
 994            df = get_samples_from_csv(instrument_id, run_id, "Both")
 995    else:
 996        df = get_samples_in_run(instrument_id, run_id, "Both")
 997
 998    try:
 999        polarity = df.loc[df["sample_id"] == sample_id]["polarity"].astype(str).values[0]
1000    except:
1001        print("Could not find polarity for sample in database.")
1002        polarity = "Neg" if "Neg" in sample_id else "Pos"
1003
1004    return polarity

Returns polarity for a given sample.

TODO: Loading hundreds of rows of data before querying for one sample is massively inefficient. This function was written in haste and can be easily implemented in a much better way.

Arguments:
  • instrument_id (str): Instrument ID
  • run_id (str): Instrument run ID (job ID)
  • sample_id (str): Sample ID
  • status (str): Job status
Returns:

Polarity for the given sample, as either "Pos" or "Neg".

def insert_new_run( run_id, instrument_id, chromatography, bio_standards, path, sequence, metadata, qc_config_id, job_type):
def insert_new_run(run_id, instrument_id, chromatography, bio_standards, path, sequence, metadata, qc_config_id, job_type):

    """
    Initializes sample records in database for a new QC job.

    Performs the following functions:
        1. Inserts a record for the new instrument run into the "runs" table, with status "Active"
           and the completed / passes / fails counters set to 0
        2. Inserts sample rows into the "sample_qc_results" table
        3. Inserts biological standard sample rows into the "bio_qc_results" table

    A sample is treated as a biological standard when its file name contains one of the identifiers
    returned by get_biological_standard_identifiers(); the first matching identifier is used.

    The database connection is closed even if one of the inserts fails.

    Args:
        run_id (str):
            Instrument run ID (job ID)
        instrument_id (str):
            Instrument ID
        chromatography (str):
            Chromatography method
        bio_standards (str):
            Biological standards
        path (str):
            Data acquisition path
        sequence (str):
            Acquisition sequence table, as JSON string in "split" format
            (it is parsed by get_filenames_from_sequence)
        metadata (str):
            Sample metadata table, as JSON string in "records" format
        qc_config_id (str):
            Name of QC configuration
        job_type (str):
            Either "completed" or "active"

    Returns:
        None
    """

    # Get list of samples from sequence
    df_sequence = get_filenames_from_sequence(sequence)

    samples = df_sequence["File Name"].astype(str).tolist()
    polarities = df_sequence["Polarity"].astype(str).tolist()
    positions = df_sequence["Position"].astype(str).tolist()

    num_samples = len(samples)

    # Connect to database
    db_metadata, connection = connect_to_database(instrument_id)

    try:
        # Get relevant tables
        runs_table = sa.Table("runs", db_metadata, autoload=True)
        sample_qc_results_table = sa.Table("sample_qc_results", db_metadata, autoload=True)
        bio_qc_results_table = sa.Table("bio_qc_results", db_metadata, autoload=True)

        # Get identifiers for biological standard (if any)
        identifiers = get_biological_standard_identifiers(bio_standards)

        # Prepare insert of user-inputted run data
        insert_run = runs_table.insert().values(
            {"run_id": run_id,
             "chromatography": chromatography,
             "acquisition_path": path,
             "sequence": sequence,
             "metadata": metadata,
             "status": "Active",
             "samples": num_samples,
             "completed": 0,
             "passes": 0,
             "fails": 0,
             "qc_config_id": qc_config_id,
             "biological_standards": str(bio_standards),
             "job_type": job_type})

        insert_samples = []

        for sample, polarity, position in zip(samples, polarities, positions):
            # First biological standard identifier found in the sample name, if any
            matched_identifier = next(
                (identifier for identifier in identifiers if identifier in sample), None)

            # Prepare insert of the sample row into the "sample_qc_results" table
            if matched_identifier is None:
                insert_sample = sample_qc_results_table.insert().values(
                    {"sample_id": sample,
                     "run_id": run_id,
                     "polarity": polarity,
                     "position": position})

            # Prepare insert of the sample row into the "bio_qc_results" table
            else:
                insert_sample = bio_qc_results_table.insert().values(
                    {"sample_id": sample,
                     "run_id": run_id,
                     "polarity": polarity,
                     "biological_standard": identifiers[matched_identifier],
                     "position": position})

            # Add this INSERT query into the list of insert queries
            insert_samples.append(insert_sample)

        # Execute INSERT to database: the run record first, then every sample row
        connection.execute(insert_run)

        for insert_sample in insert_samples:
            connection.execute(insert_sample)

    finally:
        # Release the connection regardless of whether the inserts succeeded
        connection.close()

Initializes sample records in database for a new QC job.

Performs the following functions:
  1. Inserts a record for the new instrument run into the "runs" table
  2. Inserts sample rows into the "sample_qc_results" table
  3. Inserts biological standard sample rows into the "bio_qc_results" table
Arguments:
  • run_id (str): Instrument run ID (job ID)
  • instrument_id (str): Instrument ID
  • chromatography (str): Chromatography method
  • bio_standards (str): Biological standards
  • path (str): Data acquisition path
  • sequence (str): Acquisition sequence table, as JSON string in "split" format (it is parsed by get_filenames_from_sequence)
  • metadata (str): Sample metadata table, as JSON string in "records" format
  • qc_config_id (str): Name of QC configuration
  • job_type (str): Either "completed" or "active"
Returns:

None

def get_instrument_run(instrument_id, run_id):
def get_instrument_run(instrument_id, run_id):

    """
    Returns DataFrame of given instrument run from "runs" table.

    The run ID is passed to the database as a bound parameter rather than being spliced into
    the SQL text, so ID's containing quotes or other special characters are matched literally.

    Args:
        instrument_id (str): Instrument ID
        run_id (str): Run ID

    Returns:
        DataFrame containing record for instrument run
    """

    database = get_database_file(instrument_id=instrument_id, sqlite_conn=True)
    engine = sa.create_engine(database)
    query = sa.text("SELECT * FROM runs WHERE run_id = :run_id")
    df_instrument_run = pd.read_sql(query, engine, params={"run_id": run_id})
    return df_instrument_run

Returns DataFrame of given instrument run from "runs" table.

Arguments:
  • instrument_id (str): Instrument ID
  • run_id (str): Run ID
Returns:

DataFrame containing record for instrument run

def get_instrument_run_from_csv(instrument_id, run_id):
def get_instrument_run_from_csv(instrument_id, run_id):

    """
    Loads the record of an instrument run from its "run.csv" file.

    The file is read from <data_directory>/<instrument>_<run_id>/csv/run.csv, where
    spaces in the instrument ID are replaced with underscores. Per the original
    documentation, this is used when an active instrument run is viewed from an
    external device, so the database file does not have to be downloaded / uploaded
    with each sample acquisition.

    Args:
        instrument_id (str): Instrument ID
        run_id (str): Run ID

    Returns:
        DataFrame containing record for instrument run
    """

    # Folder name is "<instrument id with underscores>_<run id>"
    run_folder = "_".join([instrument_id.replace(" ", "_"), run_id])
    csv_path = os.path.join(data_directory, run_folder, "csv", "run.csv")

    df_run = pd.read_csv(csv_path, index_col=False)
    return df_run

Returns DataFrame of selected instrument run from CSV files during active instrument runs.

This function is called when a user views an active instrument run from an external device (to prevent downloading / uploading the database file with each sample acquisition).

Arguments:
  • instrument_id (str): Instrument ID
  • run_id (str): Run ID
Returns:

DataFrame containing record for instrument run

def get_instrument_runs(instrument_id, as_list=False):
def get_instrument_runs(instrument_id, as_list=False):

    """
    Reads every record of the "runs" table for the given instrument.

    Args:
        instrument_id (str):
            Instrument ID
        as_list (bool, default False):
            If True, returns only a list of run IDs (as strings) instead of the DataFrame

    Returns:
        DataFrame containing records for instrument runs (QC jobs) for the given instrument,
        or a list of run IDs when as_list is True
    """

    engine = sa.create_engine(get_database_file(instrument_id, sqlite_conn=True))
    df_runs = pd.read_sql("SELECT * FROM runs", engine)

    # Default: hand back the full table
    if not as_list:
        return df_runs

    return df_runs["run_id"].astype(str).tolist()

Returns DataFrame of all runs on a given instrument from "runs" table

Arguments:
  • instrument_id (str): Instrument ID
  • as_list (bool, default False): If True, returns only a list of names of instrument runs (jobs)
Returns:

DataFrame containing records for instrument runs (QC jobs) for the given instrument

def delete_instrument_run(instrument_id, run_id):
def delete_instrument_run(instrument_id, run_id):

    """
    Deletes all records for an instrument run (QC job) from the database.

    Rows whose "run_id" matches are removed from the "runs", "sample_qc_results",
    and "bio_qc_results" tables. The connection is closed even if table reflection
    or one of the DELETE statements raises.

    Args:
        instrument_id (str): Instrument ID
        run_id (str): Run ID

    Returns:
        None
    """

    # Connect to database
    db_metadata, connection = connect_to_database(instrument_id)

    try:
        # Reflect all relevant tables before deleting anything
        tables = [
            sa.Table(table_name, db_metadata, autoload=True)
            for table_name in ("runs", "sample_qc_results", "bio_qc_results")
        ]

        # Delete the run's rows from each table
        for table in tables:
            connection.execute(sa.delete(table).where(table.c.run_id == run_id))
    finally:
        # Always release the connection, including on failure
        connection.close()

Deletes all records for an instrument run (QC job) from the database.

Arguments:
  • instrument_id (str): Instrument ID
  • run_id (str): Run ID
Returns:

None

def get_acquisition_path(instrument_id, run_id):
def get_acquisition_path(instrument_id, run_id):

    """
    Looks up the "acquisition_path" value stored for an instrument run.

    Args:
        instrument_id (str): Instrument ID
        run_id (str): Run ID

    Returns:
        Acquisition path (str) of the first matching record for the instrument run
    """

    df_run = get_instrument_run(instrument_id, run_id)
    acquisition_paths = df_run["acquisition_path"].astype(str).tolist()

    # Only the first matching record is used
    return acquisition_paths[0]

Retrieves acquisition path for a given instrument run.

Arguments:
  • instrument_id (str): Instrument ID
  • run_id (str): Run ID
Returns:

Acquisition path for the given instrument run

def get_md5(instrument_id, sample_id):
def get_md5(instrument_id, sample_id, run_id=None):

    """
    Returns the MD5 checksum stored for a data file.

    The checksum is read from the "bio_qc_results" table if the sample ID contains a
    biological standard identifier, and from the "sample_qc_results" table otherwise.

    Used for comparing MD5 checksums during active instrument runs.

    NOTE: If two different instrument runs have samples with the same sample ID,
        the lookup is ambiguous unless "run_id" is provided; without it, the first
        matching row is returned (the original behavior).

    Args:
        instrument_id (str): Instrument ID
        sample_id (str): Sample ID
        run_id (str, default None): If given, restricts the lookup to this instrument run

    Returns:
        MD5 checksum stored for the data file.
    """

    # Connect to database
    database = get_database_file(instrument_id, sqlite_conn=True)
    engine = sa.create_engine(database)

    # Check if sample is a biological standard
    table = "sample_qc_results"

    for identifier in get_biological_standard_identifiers().keys():
        if identifier in sample_id:
            table = "bio_qc_results"
            break

    # The table name comes from the two fixed values above; user-supplied values
    # are passed as bound parameters rather than concatenated into the SQL
    query = "SELECT * FROM " + table + " WHERE sample_id = :sample_id"
    params = {"sample_id": sample_id}

    if run_id is not None:
        query += " AND run_id = :run_id"
        params["run_id"] = run_id

    # Get sample from correct table
    df_sample_qc_results = pd.read_sql(sa.text(query), engine, params=params)

    return df_sample_qc_results["md5"].astype(str).values[0]

Returns MD5 checksum for a data file in the "sample_qc_results" table (or the "bio_qc_results" table, if the sample ID contains a biological standard identifier).

Used for comparing MD5 checksums during active instrument runs.

TODO: This function will return incorrect results if two different instrument runs have samples with the same sample ID. It needs to include "run_id" in the SQL query.

Arguments:
  • instrument_id (str): Instrument ID
  • sample_id (str): Sample ID
Returns:

MD5 checksum stored for the data file.

def update_md5_checksum(instrument_id, sample_id, md5_checksum):
def update_md5_checksum(instrument_id, sample_id, md5_checksum, run_id=None):

    """
    Updates MD5 checksum for a data file during sample acquisition.

    The checksum is written to the "bio_qc_results" table if the sample ID contains a
    biological standard identifier, and to the "sample_qc_results" table otherwise.

    NOTE: If two different instrument runs have samples with the same sample ID,
        every matching row is updated unless "run_id" is provided
        (updating all matches is the original behavior).

    Args:
        instrument_id (str):
            Instrument ID
        sample_id (str):
            Sample ID (filename) of data file
        md5_checksum (str):
            MD5 checksum for the sample data file
        run_id (str, default None):
            If given, restricts the update to this instrument run

    Returns:
        None
    """

    # Connect to database
    db_metadata, connection = connect_to_database(instrument_id)

    try:
        # Check if sample is a biological standard and get relevant table
        table_name = "sample_qc_results"

        for identifier in get_biological_standard_identifiers().keys():
            if identifier in sample_id:
                table_name = "bio_qc_results"
                break

        qc_results_table = sa.Table(table_name, db_metadata, autoload=True)

        # Match on sample ID, and on run ID as well when one was supplied
        condition = qc_results_table.c.sample_id == sample_id
        if run_id is not None:
            condition = condition & (qc_results_table.c.run_id == run_id)

        # Prepare update of MD5 checksum at sample row
        update_md5 = (
            sa.update(qc_results_table)
                .where(condition)
                .values(md5=md5_checksum)
        )

        # Execute UPDATE into database
        connection.execute(update_md5)
    finally:
        # Always release the connection, including on failure
        connection.close()

Updates MD5 checksum for a data file during sample acquisition.

TODO: This function will return incorrect results if two different instrument runs have samples with the same sample ID. It needs to include "run_id" in the SQL query.

Arguments:
  • instrument_id (str): Instrument ID
  • sample_id (str): Sample ID (filename) of data file
  • md5_checksum (str): MD5 checksum for the sample data file
Returns:

None

def write_qc_results( sample_id, instrument_id, run_id, json_mz, json_rt, json_intensity, qc_dataframe, qc_result, is_bio_standard):
def write_qc_results(sample_id, instrument_id, run_id, json_mz, json_rt, json_intensity, qc_dataframe, qc_result, is_bio_standard):

    """
    Stores the QC results of a processed sample in its existing database record.

    The row identified by (sample_id, run_id) is updated in "bio_qc_results" when
    is_bio_standard is truthy, and in "sample_qc_results" otherwise. The columns written
    are "precursor_mz", "retention_time", "intensity", "qc_dataframe", and "qc_result".

    Per the original documentation, the m/z, RT, intensity, and QC tables are passed as
    dictionaries in "records" format that were cast to strings, e.g.
    [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}], and the overall QC result
    is either "Pass" or "Fail".

    TODO: Update names of arguments from json_x to record_x, as the data is no longer encoded as JSON strings.
        The data is now encoded in "records" format as a string.

    Args:
        sample_id (str):
            Sample ID
        instrument_id (str):
            Instrument ID
        run_id (str):
            Instrument run ID (Job ID)
        json_mz (str):
            String dict of internal standard m/z data in "records" format
        json_rt (str):
            String dict of internal standard RT data in "records" format
        json_intensity (str):
            String dict of internal standard intensity data in "records" format
        qc_dataframe (str):
            String dict of various QC data in "records" format
        qc_result (str):
            QC result for sample, either "Pass" or "Fail"
        is_bio_standard (bool):
            Whether the sample is a biological standard

    Returns:
        None
    """

    db_metadata, connection = connect_to_database(instrument_id)

    # Biological standards live in their own results table
    table_name = "bio_qc_results" if is_bio_standard else "sample_qc_results"
    qc_results_table = sa.Table(table_name, db_metadata, autoload=True)

    # Target the row for this sample within this run
    matches_sample = qc_results_table.c.sample_id == sample_id
    matches_run = qc_results_table.c.run_id == run_id

    new_values = {
        "precursor_mz": json_mz,
        "retention_time": json_rt,
        "intensity": json_intensity,
        "qc_dataframe": qc_dataframe,
        "qc_result": qc_result,
    }

    statement = sa.update(qc_results_table).where(matches_sample & matches_run).values(new_values)

    # Run the UPDATE, then release the connection
    connection.execute(statement)
    connection.close()

Writes QC results (as dictionary records) to sample record upon MS-DIAL processing completion.

QC results consist of m/z, RT, and intensity data for internal standards (or targeted metabolites in biological standards), as well as a DataFrame containing delta m/z, delta RT, in-run delta RT, warnings, and fails (qc_dataframe) and overall QC result (which will be "Pass" or "Fail").

The data is encoded as dictionary in "records" format: [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}]. This dictionary is cast to a string before being passed to this function.

TODO: Update names of arguments from json_x to record_x, as the data is no longer encoded as JSON strings. The data is now encoded in "records" format as a string.

Arguments:
  • sample_id (str): Sample ID
  • instrument_id (str): Instrument ID
  • run_id (str): Instrument run ID (Job ID)
  • json_mz (str): String dict of internal standard m/z data in "records" format
  • json_rt (str): String dict of internal standard RT data in "records" format
  • json_intensity (str): String dict of internal standard intensity data in "records" format
  • qc_dataframe (str): String dict of various QC data in "records" format
  • qc_result (str): QC result for sample, either "Pass" or "Fail"
  • is_bio_standard (bool): Whether the sample is a biological standard
Returns:

None

def get_chromatography_methods():
def get_chromatography_methods():

    """
    Reads the full "chromatography_methods" table of the Settings database into a DataFrame.
    """

    settings_engine = sa.create_engine(settings_database)
    return pd.read_sql("SELECT * FROM chromatography_methods", settings_engine)

Returns DataFrame of chromatography methods from the Settings database.

def get_chromatography_methods_list():
def get_chromatography_methods_list():

    """
    Lists the "method_id" values (as strings) of all chromatography methods in the Settings database.
    """

    method_ids = get_chromatography_methods()["method_id"]
    return method_ids.astype(str).tolist()

Returns list of chromatography method ID's from the Settings database.

def insert_chromatography_method(method_id):
def insert_chromatography_method(method_id):

    """
    Inserts new chromatography method in the "chromatography_methods" table of the Settings database.

    A row is also added to the "biological_standards" table for every distinct
    biological standard, associating it with the new chromatography method.

    Each standard's name is paired with the identifier from the same row. Previously the
    names were de-duplicated while the identifiers were not, so the two lists went out of
    step as soon as a standard had more than one row in the table.

    Args:
        method_id (str): Name of the chromatography method

    Returns:
        None
    """

    # Connect to database
    db_metadata, connection = connect_to_database("Settings")

    try:
        # Get "chromatography_methods" table and "biological_standards" table
        chromatography_table = sa.Table("chromatography_methods", db_metadata, autoload=True)
        biological_standards_table = sa.Table("biological_standards", db_metadata, autoload=True)

        # Execute insert of chromatography method
        insert_method = chromatography_table.insert().values(
            {"method_id": method_id,
             "num_pos_standards": 0,
             "num_neg_standards": 0,
             "pos_istd_msp_file": "",
             "neg_istd_msp_file": "",
             "pos_parameter_file": "",
             "neg_parameter_file": "",
             "msdial_config_id": "Default"})

        connection.execute(insert_method)

        # One (name, identifier) pair per biological standard, taken from the
        # first row in which each name appears
        df_biological_standards = get_biological_standards()
        df_unique_standards = (
            df_biological_standards[["name", "identifier"]]
                .astype(str)
                .drop_duplicates(subset="name")
        )

        # Execute insert of method for each biological standard
        for name, identifier in zip(df_unique_standards["name"], df_unique_standards["identifier"]):
            insert_method_for_bio_standard = biological_standards_table.insert().values({
                "name": name,
                "identifier": identifier,
                "chromatography": method_id,
                "num_pos_features": 0,
                "num_neg_features": 0,
                "msdial_config_id": "Default"})
            connection.execute(insert_method_for_bio_standard)
    finally:
        # Always release the connection, including on failure
        connection.close()

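Inserts new chromatography method in the "chromatography_methods" table of the Settings database, and adds a row for the new method to the "biological_standards" table for each biological standard.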

Arguments:
  • method_id (str): Name of the chromatography method
Returns:

None

def remove_chromatography_method(method_id):
def remove_chromatography_method(method_id):

    """
    Deletes chromatography method and all associated records from the Settings database.

    Details:
        1. Deletes the method's MSP files (internal standard and biological standard
           libraries recorded in the database) from the methods directory
        2. Removes chromatography method in "chromatography_methods" table
        3. Removes method from "biological_standards" table
        4. Removes associated internal standards from "internal_standards" table
        5. Removes associated targeted features from "targeted_features" table

    NOTE(review): Only the local methods directory and the Settings database are touched
        here; this function makes no Google Drive call, so any removal of MSPs from
        Google Drive has to happen elsewhere.

    Args:
        method_id (str): Name of the chromatography method

    Returns:
        None
    """

    # Collect MSP filenames recorded for this method and its biological standards
    df_methods = get_table("Settings", "chromatography_methods")
    df_methods = df_methods.loc[df_methods["method_id"] == method_id]

    df_bio_standards = get_table("Settings", "biological_standards")
    df_bio_standards = df_bio_standards.loc[df_bio_standards["chromatography"] == method_id]

    files_to_delete = set(
        df_methods["pos_istd_msp_file"].astype(str).tolist()
        + df_methods["neg_istd_msp_file"].astype(str).tolist()
        + df_bio_standards["pos_bio_msp_file"].astype(str).tolist()
        + df_bio_standards["neg_bio_msp_file"].astype(str).tolist()
    )

    # Delete corresponding MSPs from "methods" directory
    for file in os.listdir(methods_directory):
        if file in files_to_delete:
            os.remove(os.path.join(methods_directory, file))

    # Connect to database
    db_metadata, connection = connect_to_database("Settings")

    try:
        # Get relevant tables
        chromatography_table = sa.Table("chromatography_methods", db_metadata, autoload=True)
        biological_standards_table = sa.Table("biological_standards", db_metadata, autoload=True)
        internal_standards_table = sa.Table("internal_standards", db_metadata, autoload=True)
        targeted_features_table = sa.Table("targeted_features", db_metadata, autoload=True)

        # Remove from "chromatography_methods" table
        delete_queries = [
            sa.delete(chromatography_table)
                .where(chromatography_table.c.method_id == method_id)
        ]

        # Remove all entries in other tables associated with chromatography
        for table in [biological_standards_table, internal_standards_table, targeted_features_table]:
            delete_queries.append(
                sa.delete(table).where(table.c.chromatography == method_id)
            )

        # Execute all deletes
        for query in delete_queries:
            connection.execute(query)
    finally:
        # Always release the connection, including on failure
        connection.close()

Deletes chromatography method and all associated records from the Settings database.

Details:
  1. Removes chromatography method in "chromatography_methods" table
  2. Removes method from "biological_standards" table
  3. Removes associated internal standards from "internal_standards" table
  4. Removes associated targeted features from "targeted_features" table
  5. Deletes corresponding MSPs from folders
  6. Deletes corresponding MSPs from Google Drive (if sync is enabled)
Arguments:
  • method_id (str): Name of the chromatography method
Returns:

None

def update_msdial_config_for_internal_standards(chromatography, config_id):
def update_msdial_config_for_internal_standards(chromatography, config_id):

    """
    Assigns an MS-DIAL configuration to a chromatography method.

    Sets the "msdial_config_id" column of the matching row in the "chromatography_methods"
    table of the Settings database. Per the original documentation, this configuration is
    used to generate the parameters file for processing samples run with this method.

    Args:
        chromatography (str):
            Chromatography method ID (name)
        config_id (str):
            MS-DIAL configuration ID (name)

    Returns:
        None
    """

    db_metadata, connection = connect_to_database("Settings")
    methods_table = sa.Table("chromatography_methods", db_metadata, autoload=True)

    # Only the row for the requested chromatography method is changed
    is_target_method = methods_table.c.method_id == chromatography
    statement = sa.update(methods_table).where(is_target_method).values({"msdial_config_id": config_id})

    connection.execute(statement)
    connection.close()

Updates MS-DIAL configuration for a given chromatography method.

This MS-DIAL configuration will be used to generate a parameters file for processing samples run with this chromatography method.

Arguments:
  • chromatography (str): Chromatography method ID (name)
  • config_id (str): MS-DIAL configuration ID (name)
Returns:

None

def add_msp_to_database(msp_file, chromatography, polarity, bio_standard=None):
def add_msp_to_database(msp_file, chromatography, polarity, bio_standard=None):

    """
    Parses compounds from MSP into the Settings database.

    The MSP is first written to the methods directory under a standardized name
    (<chromatography>_Pos/_Neg.msp, prefixed with the biological standard name when one
    is given), then read back and parsed.

    Without a biological standard, the existing internal standards for the
    chromatography-polarity combination are replaced in the "internal_standards" table, and
    the feature count and MSP filename are written to the "chromatography_methods" table.

    With a biological standard, the existing targeted features for the
    chromatography-polarity-standard combination are replaced in the "targeted_features"
    table, and the feature count and MSP filename are written to the "biological_standards" table.

    Finally, a TXT library with the same base name is deleted from the methods directory if it exists.

    TODO: The MSP/TXT libraries have standardized names; there is no need to store the filename in the database.

    Args:
        msp_file (io.StringIO):
            In-memory text-stream file object
        chromatography (str):
            Chromatography method ID (name)
        polarity (str):
            Polarity the MSP applies to ("Positive Mode" or "Negative Mode")
        bio_standard (str, default None):
            Parses MSP and applies to biological standard within a chromatography-polarity combination

    Returns:
        None

    Raises:
        ValueError: If polarity is neither "Positive Mode" nor "Negative Mode"
    """

    # Validate polarity before touching the disk or database; previously an unexpected
    # value surfaced later as an unbound "filename" after the connection was already open
    polarity_suffixes = {"Positive Mode": "Pos", "Negative Mode": "Neg"}

    if polarity not in polarity_suffixes:
        raise ValueError(
            "polarity must be \"Positive Mode\" or \"Negative Mode\", got " + repr(polarity))

    is_positive = polarity == "Positive Mode"

    # Connect to database
    db_metadata, connection = connect_to_database("Settings")

    try:
        # Write MSP file to folder, store filename in database (further down in function)
        os.makedirs(methods_directory, exist_ok=True)

        filename = chromatography + "_" + polarity_suffixes[polarity] + ".msp"
        if bio_standard is not None:
            filename = bio_standard.replace(" ", "_") + "_" + filename

        msp_file_path = os.path.join(methods_directory, filename)

        with open(msp_file_path, "w") as file:
            msp_file.seek(0)
            shutil.copyfileobj(msp_file, file)

        # Read MSP file back and parse its compounds
        with open(msp_file_path, "r") as msp:
            features = _parse_msp_features(msp.read())

        # Adding MSP for biological standards
        if bio_standard is not None:

            # Get "targeted_features" table
            features_table = sa.Table("targeted_features", db_metadata, autoload=True)

            # DELETE old targeted features
            connection.execute(
                sa.delete(features_table)
                    .where((features_table.c.chromatography == chromatography)
                           & (features_table.c.polarity == polarity)
                           & (features_table.c.biological_standard == bio_standard))
            )

            # INSERT each targeted feature into targeted_features table
            for feature in features:
                connection.execute(features_table.insert().values(
                    {"name": feature["Name"],
                     "chromatography": chromatography,
                     "polarity": polarity,
                     "biological_standard": bio_standard,
                     "precursor_mz": feature["Precursor m/z"],
                     "retention_time": feature["Retention time"],
                     "ms2_spectrum": feature["MS2 spectrum"],
                     "inchikey": feature["INCHIKEY"]}))

            # Get "biological_standards" table
            summary_table = sa.Table("biological_standards", db_metadata, autoload=True)
            summary_row = ((summary_table.c.chromatography == chromatography)
                           & (summary_table.c.name == bio_standard))

            if is_positive:
                summary_values = {"num_pos_features": len(features), "pos_bio_msp_file": filename}
            else:
                summary_values = {"num_neg_features": len(features), "neg_bio_msp_file": filename}

        # Adding MSP for internal standards
        else:

            # Get internal_standards table
            features_table = sa.Table("internal_standards", db_metadata, autoload=True)

            # DELETE old internal standards
            connection.execute(
                sa.delete(features_table)
                    .where((features_table.c.chromatography == chromatography)
                           & (features_table.c.polarity == polarity))
            )

            # INSERT each internal standard into internal_standards table
            for feature in features:
                connection.execute(features_table.insert().values(
                    {"name": feature["Name"],
                     "chromatography": chromatography,
                     "polarity": polarity,
                     "precursor_mz": feature["Precursor m/z"],
                     "retention_time": feature["Retention time"],
                     "ms2_spectrum": feature["MS2 spectrum"],
                     "inchikey": feature["INCHIKEY"]}))

            # Get "chromatography_methods" table
            summary_table = sa.Table("chromatography_methods", db_metadata, autoload=True)
            summary_row = summary_table.c.method_id == chromatography

            if is_positive:
                summary_values = {"num_pos_standards": len(features), "pos_istd_msp_file": filename}
            else:
                summary_values = {"num_neg_standards": len(features), "neg_istd_msp_file": filename}

        # Write feature count and MSP filename to the respective row
        connection.execute(
            sa.update(summary_table).where(summary_row).values(summary_values))

        # If the corresponding TXT library existed, delete it
        txt_library = os.path.join(methods_directory, filename.replace(".msp", ".txt"))
        if os.path.exists(txt_library):
            os.remove(txt_library)

    finally:
        # Always release the connection, including on failure
        connection.close()


def _parse_msp_features(msp_text):

    """
    Parses the text of an MSP library into a list of feature dictionaries.

    Compounds are separated by blank lines. For each compound, the name, precursor m/z,
    retention time, InChIKey, and MS2 spectrum are captured; a field that is not found
    stays None. Only the first compound with a given name is kept, and compounds for
    which no name was captured are dropped.

    Args:
        msp_text (str): Full contents of the MSP file

    Returns:
        List of dicts with keys "Name", "Precursor m/z", "Retention time",
        "INCHIKEY", and "MS2 spectrum", in file order
    """

    # Split MSP into compounds, each a list of lines; single-line chunks are skipped
    compounds = [chunk.split("\n") for chunk in msp_text.split("\n\n")]
    compounds = [lines for lines in compounds if len(lines) != 1]

    features = []
    seen_names = set()

    for lines in compounds:

        feature = {
            "Name": None,
            "Precursor m/z": None,
            "Retention time": None,
            "INCHIKEY": None,
            "MS2 spectrum": None
        }

        for line_index, line in enumerate(lines):

            upper_line = line.upper()
            value = line.split(": ")[-1]

            # Capture name, inchikey, m/z, and RT (matching is by substring, in this order)
            if "NAME" in upper_line:
                # A name that was already added ends parsing of this compound
                if value in seen_names:
                    break
                seen_names.add(value)
                feature["Name"] = value
            elif "PRECURSORMZ" in upper_line:
                feature["Precursor m/z"] = value
            elif "INCHIKEY" in upper_line:
                feature["INCHIKEY"] = value
            elif "RETENTIONTIME" in upper_line:
                feature["Retention time"] = value

            # Capture MS2 spectrum
            elif "Num Peaks" in line:

                # The next num_peaks lines each hold one peak, e.g. "56.04977\t247187"
                num_peaks = int(value)
                peak_lines = lines[line_index + 1:line_index + num_peaks + 1]

                feature["MS2 spectrum"] = str([peak.replace("\t", ":") for peak in peak_lines])
                break

        if feature["Name"] is not None:
            features.append(feature)

    return features

Parses compounds from MSP into the Settings database.

This function writes features from an MSP file into the "internal_standards" or "targeted_features" table, and stores the name of the pos/neg MSP file in the "chromatography_methods" table (or in the "biological_standards" table when a biological standard is given).

TODO: The MSP/TXT libraries have standardized names; there is no need to store the filename in the database.

Arguments:
  • msp_file (io.StringIO): In-memory text-stream file object
  • chromatography (str): Chromatography method ID (name)
  • polarity (str): Polarity the MSP should be used for ("Positive Mode" or "Negative Mode")
  • bio_standard (str, default None): Parses MSP and applies to biological standard within a chromatography-polarity combination
Returns:

None

def add_csv_to_database(csv_file, chromatography, polarity):
def add_csv_to_database(csv_file, chromatography, polarity):

    """
    Parses compounds from a CSV file into the Settings database.

    Replaces the rows of the "internal_standards" table for the given chromatography / polarity
    with the compounds in the CSV, writes the CSV contents to a tab-delimited TXT library in the
    methods directory, and stores the TXT filename and the number of standards in the
    "chromatography_methods" table. If an MSP library with the same base name exists in the
    methods directory, it is deleted.

    TODO: The MSP/TXT libraries have standardized names; there is no need to store the filename in the database.

    Args:
        csv_file (io.StringIO):
            In-memory text-stream file object. Rows are read from the columns
            "Common Name", "MS1 m/z" and "RT (min)".
        chromatography (str):
            Chromatography method ID (name)
        polarity (str):
            Polarity that the library applies to ("Positive Mode" or "Negative Mode")

    Returns:
        None

    Raises:
        ValueError: If polarity is neither "Positive Mode" nor "Negative Mode".
        KeyError: If the CSV has rows but lacks one of the required columns.
    """

    # Resolve everything that depends on polarity first, so an unsupported value
    # fails before any file or database state has been touched
    if polarity == "Positive Mode":
        filename = chromatography + "_Pos.txt"
        count_column, file_column = "num_pos_standards", "pos_istd_msp_file"
    elif polarity == "Negative Mode":
        filename = chromatography + "_Neg.txt"
        count_column, file_column = "num_neg_standards", "neg_istd_msp_file"
    else:
        raise ValueError(
            'Polarity must be "Positive Mode" or "Negative Mode", not ' + repr(polarity))

    # Convert CSV file into database records; done before the DELETE below so that
    # a CSV with missing columns cannot wipe out the existing internal standards
    df_internal_standards = pd.read_csv(csv_file, index_col=False)
    new_standards = [
        {"name": row["Common Name"],
         "chromatography": chromatography,
         "polarity": polarity,
         "precursor_mz": row["MS1 m/z"],
         "retention_time": row["RT (min)"]}
        for row in df_internal_standards.to_dict("index").values()
    ]

    # Create methods directory if it doesn't already exist
    os.makedirs(methods_directory, exist_ok=True)

    # Write CSV columns to tab-delimited text file
    txt_file_path = os.path.join(methods_directory, filename)
    df_internal_standards.to_csv(txt_file_path, sep="\t", index=False)

    # Connect to database; the connection is closed even if a statement fails
    db_metadata, connection = connect_to_database("Settings")

    try:
        internal_standards_table = sa.Table("internal_standards", db_metadata, autoload=True)

        # DELETE old internal standards for this chromatography / polarity
        connection.execute(
            sa.delete(internal_standards_table)
                .where((internal_standards_table.c.chromatography == chromatography)
                       & (internal_standards_table.c.polarity == polarity))
        )

        # INSERT all new internal standards in one batch; skipped for an empty CSV,
        # since an INSERT executed with an empty parameter list is not a no-op
        if new_standards:
            connection.execute(internal_standards_table.insert(), new_standards)

        # Write filename of TXT library and number of standards to the chromatography method
        chromatography_table = sa.Table("chromatography_methods", db_metadata, autoload=True)
        connection.execute(
            sa.update(chromatography_table)
                .where(chromatography_table.c.method_id == chromatography)
                .values({count_column: len(new_standards), file_column: filename})
        )
    finally:
        connection.close()

    # If the corresponding MSP library existed, delete it
    msp_library = os.path.join(methods_directory, filename.replace(".txt", ".msp"))
    if os.path.exists(msp_library):
        os.remove(msp_library)

Parses compounds from a CSV file into the Settings database.

Parses compounds from a CSV into the "internal_standards" table, and stores the location of the pos/neg TXT files in "chromatography_methods" table.

TODO: The MSP/TXT libraries have standardized names; there is no need to store the filename in the database.

Arguments:
  • csv_file (io.StringIO): In-memory text-stream file object
  • chromatography (str): Chromatography method ID (name)
  • polarity (str): Polarity that the CSV library applies to ("Positive Mode" or "Negative Mode")
Returns:

None

def get_msdial_configurations():
def get_msdial_configurations():

    """
    Lists the names of all MS-DIAL configurations saved in the Settings database.

    Returns:
        List of the values in the "config_name" column of the "msdial_parameters" table, as strings.
    """

    settings_engine = sa.create_engine(settings_database)
    df_configs = pd.read_sql("SELECT * FROM msdial_parameters", settings_engine)

    config_names = df_configs["config_name"].astype(str)
    return config_names.tolist()

Returns list of user configurations of MS-DIAL parameters from Settings database.

def generate_msdial_parameters_file(chromatography, polarity, msp_file_path, bio_standard=None):
def generate_msdial_parameters_file(chromatography, polarity, msp_file_path, bio_standard=None):

    """
    Uses parameters from user-curated MS-DIAL configuration to create a parameters.txt file for MS-DIAL.

    The file is written to the methods directory, and its path is stored in the "biological_standards"
    table (if bio_standard is given) or in the "chromatography_methods" table (otherwise).

    TODO: Currently, this function is only called upon a new job setup. To allow changes during a QC job,
        this function should be called every time the user makes a configuration save in Settings > MS-DIAL Configurations.

    Args:
        chromatography (str):
            Chromatography method ID (name)
        polarity (str):
            Polarity ("Positive" or "Negative")
        msp_file_path (str):
            Library file path, ending in ".msp" or ".txt"
        bio_standard (str, default None):
            Specifies that the parameters file is for a biological standard

    Returns:
        None

    Raises:
        ValueError: If polarity is neither "Positive" nor "Negative", or if msp_file_path
            does not end in ".msp" or ".txt".
    """

    # Resolve everything that depends on polarity / file type first, so that
    # unsupported input fails before any file or database state has been touched
    if polarity == "Positive":
        suffix, adduct_type, parameter_column = "Pos", "[M+H]+", "pos_parameter_file"
    elif polarity == "Negative":
        suffix, adduct_type, parameter_column = "Neg", "[M-H]-", "neg_parameter_file"
    else:
        raise ValueError('Polarity must be "Positive" or "Negative", not ' + repr(polarity))

    if msp_file_path.endswith(".msp"):
        filepath = "MSP file: " + msp_file_path
    elif msp_file_path.endswith(".txt"):
        filepath = "Text file: " + msp_file_path
    else:
        raise ValueError('Library file must end in ".msp" or ".txt": ' + repr(msp_file_path))

    # Get name of the selected MS-DIAL configuration
    if bio_standard is not None:
        df_bio_standards = get_biological_standards()
        df_bio_standards = df_bio_standards.loc[
            (df_bio_standards["chromatography"] == chromatography) & (df_bio_standards["name"] == bio_standard)]
        config_name = df_bio_standards["msdial_config_id"].astype(str).values[0]
        file_prefix = bio_standard
    else:
        df_methods = get_chromatography_methods()
        df_methods = df_methods.loc[df_methods["method_id"] == chromatography]
        config_name = df_methods["msdial_config_id"].astype(str).values[0]
        file_prefix = chromatography

    # Tuple of parameter values, indexed by position below
    parameters = get_msdial_configuration_parameters(config_name)

    # Create "methods" directory if it does not exist
    os.makedirs(methods_directory, exist_ok=True)

    # Name parameters file accordingly
    filename = file_prefix.replace(" ", "_") + "_" + config_name.replace(" ", "_") \
        + "_Parameters_" + suffix + ".txt"
    parameters_file = os.path.join(methods_directory, filename)

    # Text file contents; each "\n" entry separates two sections with a blank line
    lines = [
        "#Data type",
        "MS1 data type: Centroid",
        "MS2 data type: Centroid",
        "Ion mode: " + polarity,
        "DIA file:", "\n",

        "#Data collection parameters",
        "Retention time begin: " + str(parameters[0]),
        "Retention time end: " + str(parameters[1]),
        "Mass range begin: " + str(parameters[2]),
        "Mass range end: " + str(parameters[3]), "\n",

        "#Centroid parameters",
        "MS1 tolerance for centroid: " + str(parameters[4]),
        "MS2 tolerance for centroid: " + str(parameters[5]), "\n",

        "#Peak detection parameters",
        "Smoothing method: " + str(parameters[6]),
        "Smoothing level: " + str(parameters[7]),
        "Minimum peak width: " + str(parameters[8]),
        "Minimum peak height: " + str(parameters[9]),
        "Mass slice width: " + str(parameters[10]), "\n",

        "#Deconvolution parameters",
        "Sigma window value: 0.5",
        "Amplitude cut off: 0", "\n",

        "#Adduct list",
        "Adduct list: " + adduct_type, "\n",

        "#Text file and post identification (retention time and accurate mass based) setting",
        filepath,
        "Retention time tolerance for post identification: " + str(parameters[11]),
        "Accurate ms1 tolerance for post identification: " + str(parameters[12]),
        "Post identification score cut off: " + str(parameters[13]), "\n",

        "#Alignment parameters setting",
        "Retention time tolerance for alignment: " + str(parameters[14]),
        "MS1 tolerance for alignment: " + str(parameters[15]),
        "Retention time factor for alignment: " + str(parameters[16]),
        "MS1 factor for alignment: " + str(parameters[17]),
        "Peak count filter: " + str(parameters[18]),
        "QC at least filter: " + str(parameters[19]),
    ]

    # Write parameters to a text file, one entry per line (separators are written as-is)
    with open(parameters_file, "w") as file:
        for line in lines:
            file.write(line if line == "\n" else line + "\n")

    # Write path of parameters text file to database; the connection is closed even on failure
    db_metadata, connection = connect_to_database("Settings")

    try:
        if bio_standard is not None:
            # For processing biological standard samples
            biological_standards_table = sa.Table("biological_standards", db_metadata, autoload=True)
            update_parameter_file = (
                sa.update(biological_standards_table)
                    .where((biological_standards_table.c.chromatography == chromatography)
                           & (biological_standards_table.c.name == bio_standard))
                    .values({parameter_column: parameters_file})
            )
        else:
            # For processing samples with internal standards
            chromatography_table = sa.Table("chromatography_methods", db_metadata, autoload=True)
            update_parameter_file = (
                sa.update(chromatography_table)
                    .where(chromatography_table.c.method_id == chromatography)
                    .values({parameter_column: parameters_file})
            )

        connection.execute(update_parameter_file)
    finally:
        connection.close()

Uses parameters from user-curated MS-DIAL configuration to create a parameters.txt file for MS-DIAL.

TODO: Currently, this function is only called upon a new job setup. To allow changes during a QC job, this function should be called every time the user makes a configuration save in Settings > MS-DIAL Configurations.

Arguments:
  • chromatography (str): Chromatography method ID (name)
  • polarity (str): Polarity ("Positive" or "Negative")
  • msp_file_path (str): MSP library file path
  • bio_standard (str, default None): Specifies that the parameters file is for a biological standard
Returns:

None

def add_msdial_configuration(msdial_config_name):
def add_msdial_configuration(msdial_config_name):

    """
    Creates a new MS-DIAL configuration, populated with default parameter values,
    in the "msdial_parameters" table of the Settings database.

    Args:
        msdial_config_name (str): MS-DIAL configuration ID

    Returns:
        None
    """

    # New record: the given name plus the default value for every parameter
    new_configuration = {
        "config_name": msdial_config_name,
        "rt_begin": 0,
        "rt_end": 100,
        "mz_begin": 0,
        "mz_end": 2000,
        "ms1_centroid_tolerance": 0.008,
        "ms2_centroid_tolerance": 0.01,
        "smoothing_method": "LinearWeightedMovingAverage",
        "smoothing_level": 3,
        "min_peak_width": 3,
        "min_peak_height": 35000,
        "mass_slice_width": 0.1,
        "post_id_rt_tolerance": 0.3,
        "post_id_mz_tolerance": 0.008,
        "post_id_score_cutoff": 85,
        "alignment_rt_tolerance": 0.05,
        "alignment_mz_tolerance": 0.008,
        "alignment_rt_factor": 0.5,
        "alignment_mz_factor": 0.5,
        "peak_count_filter": 0,
        "qc_at_least_filter": "True",
    }

    # Open the Settings database and load the MS-DIAL parameters table
    db_metadata, connection = connect_to_database("Settings")
    parameters_table = sa.Table("msdial_parameters", db_metadata, autoload=True)

    # INSERT the new record, then release the connection
    insert_statement = sa.insert(parameters_table).values(new_configuration)
    connection.execute(insert_statement)
    connection.close()

Inserts new user configuration of MS-DIAL parameters into the "msdial_parameters" table in Settings database.

Arguments:
  • msdial_config_name (str): MS-DIAL configuration ID
Returns:

None

def remove_msdial_configuration(msdial_config_name):
def remove_msdial_configuration(msdial_config_name):

    """
    Removes the MS-DIAL configuration with the given name from the "msdial_parameters" table
    of the Settings database.

    Args:
        msdial_config_name (str): MS-DIAL configuration ID

    Returns:
        None
    """

    # Open the Settings database and load the MS-DIAL parameters table
    db_metadata, connection = connect_to_database("Settings")
    parameters_table = sa.Table("msdial_parameters", db_metadata, autoload=True)

    # DELETE every record whose name matches the requested configuration
    is_selected_config = parameters_table.c.config_name == msdial_config_name
    delete_statement = parameters_table.delete().where(is_selected_config)

    connection.execute(delete_statement)
    connection.close()

Deletes user configuration of MS-DIAL parameters from the "msdial_parameters" table.

Arguments:
  • msdial_config_name (str): MS-DIAL configuration ID
Returns:

None

def get_msdial_configuration_parameters(msdial_config_name, parameter=None):
def get_msdial_configuration_parameters(msdial_config_name, parameter=None):

    """
    Returns tuple of parameters defined for a selected MS-DIAL configuration.

    The tuple holds the values of every column of the "msdial_parameters" table except
    "id" and "config_name", in table column order.

    TODO: The MS-DIAL configuration is returned as a tuple for a concise implementation of get_msdial_parameters_for_config()
        in the DashWebApp module. While convenient there, this function is not optimal for maintainability. Should return
        the entire DataFrame record instead.

    See update_msdial_configuration() for details on parameters.

    Args:
        msdial_config_name (str):
            MS-DIAL configuration ID
        parameter (str, default None):
            If specified, returns only the value for the given parameter (column name)

    Returns:
        Tuple of parameters for the given MS-DIAL configuration, or single parameter value.

    Raises:
        IndexError: If no configuration with the given name exists.
        KeyError: If parameter is not a column of the configuration record.
    """

    # Get "msdial_parameters" table from database as a DataFrame
    engine = sa.create_engine(settings_database)
    df_configurations = pd.read_sql("SELECT * FROM msdial_parameters", engine)

    # Get selected configuration
    selected_config = df_configurations.loc[
        df_configurations["config_name"] == msdial_config_name]

    # Same exception type as indexing an empty result, but with an actionable message
    if len(selected_config) == 0:
        raise IndexError("No MS-DIAL configuration named " + repr(msdial_config_name))

    # Drop into a new DataFrame: an in-place drop on the filtered slice
    # triggers pandas' SettingWithCopyWarning
    selected_config = selected_config.drop(columns=["id", "config_name"])

    if parameter is not None:
        return selected_config[parameter].values[0]

    return tuple(selected_config.to_records(index=False)[0])

Returns tuple of parameters defined for a selected MS-DIAL configuration.

TODO: The MS-DIAL configuration is returned as a tuple for a concise implementation of get_msdial_parameters_for_config() in the DashWebApp module. While convenient there, this function is not optimal for maintainability. Should return the entire DataFrame record instead.

See update_msdial_configuration() for details on parameters.

Arguments:
  • msdial_config_name (str): MS-DIAL configuration ID
  • parameter (str, default None): If specified, returns only the value for the given parameter
Returns:

Tuple of parameters for the given MS-DIAL configuration, or single parameter value.

def update_msdial_configuration( config_name, rt_begin, rt_end, mz_begin, mz_end, ms1_centroid_tolerance, ms2_centroid_tolerance, smoothing_method, smoothing_level, mass_slice_width, min_peak_width, min_peak_height, post_id_rt_tolerance, post_id_mz_tolerance, post_id_score_cutoff, alignment_rt_tolerance, alignment_mz_tolerance, alignment_rt_factor, alignment_mz_factor, peak_count_filter, qc_at_least_filter):
def update_msdial_configuration(config_name, rt_begin, rt_end, mz_begin, mz_end, ms1_centroid_tolerance,
    ms2_centroid_tolerance, smoothing_method, smoothing_level, mass_slice_width, min_peak_width, min_peak_height,
    post_id_rt_tolerance, post_id_mz_tolerance, post_id_score_cutoff, alignment_rt_tolerance, alignment_mz_tolerance,
    alignment_rt_factor, alignment_mz_factor, peak_count_filter, qc_at_least_filter):

    """
    Saves new values for every parameter of an existing MS-DIAL configuration.

    For details on MS-DIAL parameters, see: https://mtbinfo-team.github.io/mtbinfo.github.io/MS-DIAL/tutorial#section-2-3

    Args:
        config_name (str):
            Name / ID of the MS-DIAL configuration to update
        rt_begin (int):
            Minimum retention time of the RT range for analysis
        rt_end (int):
            Maximum retention time of the RT range for analysis
        mz_begin (float):
            Minimum precursor mass of the m/z range for analysis
        mz_end (float):
            Maximum precursor mass of the m/z range for analysis
        ms1_centroid_tolerance (float):
            MS1 centroid tolerance
        ms2_centroid_tolerance (float):
            MS2 centroid tolerance
        smoothing_method (str):
            Peak smoothing method for peak detection
        smoothing_level (int):
            Peak smoothing level
        mass_slice_width (float):
            Mass slice width
        min_peak_width (int):
            Minimum peak width threshold
        min_peak_height (int):
            Minimum peak height threshold
        post_id_rt_tolerance (float):
            Post-identification retention time tolerance
        post_id_mz_tolerance (float):
            Post-identification precursor m/z tolerance
        post_id_score_cutoff (int):
            Similarity score cutoff after peak identification
        alignment_rt_tolerance (float):
            Post-alignment retention time tolerance
        alignment_mz_tolerance (float):
            Post-alignment precursor m/z tolerance
        alignment_rt_factor (float):
            Post-alignment retention time factor
        alignment_mz_factor (float):
            Post-alignment precursor m/z factor
        peak_count_filter (int):
            Peak count filter
        qc_at_least_filter (str):
            QC at least filter

    Returns:
        None
    """

    # Column name -> new value, for every editable column of the configuration record
    new_values = {
        "rt_begin": rt_begin,
        "rt_end": rt_end,
        "mz_begin": mz_begin,
        "mz_end": mz_end,
        "ms1_centroid_tolerance": ms1_centroid_tolerance,
        "ms2_centroid_tolerance": ms2_centroid_tolerance,
        "smoothing_method": smoothing_method,
        "smoothing_level": smoothing_level,
        "min_peak_width": min_peak_width,
        "min_peak_height": min_peak_height,
        "mass_slice_width": mass_slice_width,
        "post_id_rt_tolerance": post_id_rt_tolerance,
        "post_id_mz_tolerance": post_id_mz_tolerance,
        "post_id_score_cutoff": post_id_score_cutoff,
        "alignment_rt_tolerance": alignment_rt_tolerance,
        "alignment_mz_tolerance": alignment_mz_tolerance,
        "alignment_rt_factor": alignment_rt_factor,
        "alignment_mz_factor": alignment_mz_factor,
        "peak_count_filter": peak_count_filter,
        "qc_at_least_filter": qc_at_least_filter,
    }

    # Open the Settings database and load the MS-DIAL parameters table
    db_metadata, connection = connect_to_database("Settings")
    parameters_table = sa.Table("msdial_parameters", db_metadata, autoload=True)

    # UPDATE the record of the selected configuration, then release the connection
    is_selected_config = parameters_table.c.config_name == config_name
    update_statement = sa.update(parameters_table).where(is_selected_config).values(**new_values)

    connection.execute(update_statement)
    connection.close()

Updates and saves changes of all parameters for a selected MS-DIAL configuration.

For details on MS-DIAL parameters, see: https://mtbinfo-team.github.io/mtbinfo.github.io/MS-DIAL/tutorial#section-2-3

Arguments:
  • config_name (str): Name / ID of MS-DIAL configuration
  • rt_begin (int): Minimum retention time in RT range for analysis range
  • rt_end (int): Maximum retention time in RT range for analysis
  • mz_begin (float): Minimum precursor mass in m/z range for analysis range
  • mz_end (float): Maximum precursor mass in m/z range for analysis range
  • ms1_centroid_tolerance (float): MS1 centroid tolerance
  • ms2_centroid_tolerance (float): MS2 centroid tolerance
  • smoothing_method (str): Peak smoothing method for peak detection
  • smoothing_level (int): Peak smoothing level
  • mass_slice_width (float): Mass slice width
  • min_peak_width (int): Minimum peak width threshold
  • min_peak_height (int): Minimum peak height threshold
  • post_id_rt_tolerance (float): Post-identification retention time tolerance
  • post_id_mz_tolerance (float): Post-identification precursor m/z tolerance
  • post_id_score_cutoff (int): Similarity score cutoff after peak identification
  • alignment_rt_tolerance (float): Post-alignment retention time tolerance
  • alignment_mz_tolerance (float): Post-alignment precursor m/z tolerance
  • alignment_rt_factor (float): Post-alignment retention time factor
  • alignment_mz_factor (float): Post-alignment precursor m/z factor
  • peak_count_filter (int): Peak count filter
  • qc_at_least_filter (str): QC at least filter
Returns:

None

def get_msp_file_path(chromatography, polarity, bio_standard=None):
def get_msp_file_path(chromatography, polarity, bio_standard=None):

    """
    Returns file paths of MSPs for a selected chromatography / polarity (both stored
    in the methods folder upon user upload) for MS-DIAL parameter file generation.

    The filename is read from the "biological_standards" table if bio_standard is given,
    otherwise from the "chromatography_methods" table, and is joined onto the methods directory.

    TODO: Once added to workspace, MSP / TXT library file names are standardized. No need to store / retrieve from database.
        Get the file path using the filename e.g. return directory + chromatography + "_" + polarity + ".msp".

    Args:
        chromatography (str):
            Chromatography method ID
        polarity (str):
            Polarity, either "Positive" or "Negative"
        bio_standard (str, default None):
            Name of biological standard

    Returns:
        MSP / TXT library file path.

    Raises:
        ValueError: If polarity is neither "Positive" nor "Negative".
        IndexError: If no matching record exists in the database.
    """

    if polarity == "Positive":
        column_prefix = "pos"
    elif polarity == "Negative":
        column_prefix = "neg"
    else:
        raise ValueError('Polarity must be "Positive" or "Negative", not ' + repr(polarity))

    # Bound parameters (instead of string concatenation) keep names containing quotes
    # from breaking or altering the query
    if bio_standard is not None:
        # Get selected biological standard
        query = sa.text(
            "SELECT * FROM biological_standards WHERE name = :name AND chromatography = :chromatography")
        query_parameters = {"name": bio_standard, "chromatography": chromatography}
        msp_column = column_prefix + "_bio_msp_file"
    else:
        # Get selected chromatography method
        query = sa.text("SELECT * FROM chromatography_methods WHERE method_id = :method_id")
        query_parameters = {"method_id": chromatography}
        msp_column = column_prefix + "_istd_msp_file"

    # Connect to database and fetch the matching record
    engine = sa.create_engine(settings_database)
    df_record = pd.read_sql(query, engine, params=query_parameters)

    # Get filename of library in requested polarity, and return its path in the methods directory
    msp_filename = df_record[msp_column].astype(str).values[0]
    return os.path.join(methods_directory, msp_filename)

Returns file paths of MSPs for a selected chromatography / polarity (both stored in the methods folder upon user upload) for MS-DIAL parameter file generation.

TODO: Once added to workspace, MSP / TXT library file names are standardized. No need to store / retrieve from database. Get the file path using the filename e.g. return directory + chromatography + "_" + polarity + ".msp".

Arguments:
  • chromatography (str): Chromatography method ID
  • polarity (str): Polarity, either "Positive" or "Negative"
  • bio_standard (str, default None): Name of biological standard
Returns:

MSP / TXT library file path.

def get_parameter_file_path(chromatography, polarity, biological_standard=None):
def get_parameter_file_path(chromatography, polarity, biological_standard=None):

    """
    Returns file path of parameters file stored in database.

    The path is read from the "biological_standards" table if biological_standard is given,
    otherwise from the "chromatography_methods" table.

    TODO: Once generated, MS-DIAL parameter filenames are standardized. No need to store / retrieve from database.
        Get the file path using the filename e.g. return directory + chromatography + "_" + polarity + "_Parameters.txt".

    Args:
        chromatography (str):
            Chromatography method ID
        polarity (str):
            Polarity, either "Pos" or "Neg" ("Positive" and "Negative" are accepted as well)
        biological_standard (str, default None):
            Name of biological standard

    Returns:
        File path for MS-DIAL parameters.txt file.

    Raises:
        ValueError: If polarity is not one of the accepted values.
        IndexError: If no matching record exists in the database.
    """

    # Existing callers pass the abbreviated form; the long form is accepted too, so that
    # this function agrees with the other polarity-taking functions of this module
    if polarity in ("Pos", "Positive"):
        parameter_column = "pos_parameter_file"
    elif polarity in ("Neg", "Negative"):
        parameter_column = "neg_parameter_file"
    else:
        raise ValueError('Polarity must be "Pos" or "Neg", not ' + repr(polarity))

    # Bound parameters (instead of string concatenation) keep names containing quotes
    # from breaking or altering the query
    if biological_standard is not None:
        query = sa.text(
            "SELECT * FROM biological_standards WHERE chromatography = :chromatography AND name = :name")
        query_parameters = {"chromatography": chromatography, "name": biological_standard}
    else:
        query = sa.text("SELECT * FROM chromatography_methods WHERE method_id = :method_id")
        query_parameters = {"method_id": chromatography}

    engine = sa.create_engine(settings_database)
    df = pd.read_sql(query, engine, params=query_parameters)

    return df[parameter_column].astype(str).values[0]

Returns file path of parameters file stored in database.

TODO: Once generated, MS-DIAL parameter filenames are standardized. No need to store / retrieve from database. Get the file path using the filename e.g. return directory + chromatography + "_" + polarity + "_Parameters.txt".

Arguments:
  • chromatography (str): Chromatography method ID
  • polarity (str): Polarity, either "Pos" or "Neg"
  • biological_standard (str, default None): Name of biological standard
Returns:

File path for MS-DIAL parameters.txt file.

def get_msdial_directory():
def get_msdial_directory():

    """
    Looks up the MS-DIAL directory saved in the "workspace" table of the Settings database.

    Returns:
        First value of the "msdial_directory" column, as a string.
    """

    df_workspace = get_table("Settings", "workspace")
    msdial_directories = df_workspace["msdial_directory"].astype(str)
    return msdial_directories.values[0]

Returns location of MS-DIAL directory.

def get_msconvert_directory():
def get_msconvert_directory():

    """
    Returns location of MSConvert directory.

    The user ID is taken from the third path component of the MS-DIAL directory path
    (e.g. C:/Users/<username>/...), and is then used to search
    C:/Users/<username>/AppData/Local/Apps for a folder whose name contains "ProteoWizard".

    TODO: There is probably a better way to implement this.

    Returns:
        Path of the first matching folder in C:/Users/<username>/AppData/Local/Apps.
    """

    # Normalize path separators, then pick out the username component
    msdial_path_parts = get_msdial_directory().replace("\\", "/").split("/")
    username = msdial_path_parts[2]
    apps_folder = "C:/Users/" + username + "/AppData/Local/Apps/"

    # Collect every directory that looks like a ProteoWizard installation
    proteowizard_folders = []
    for entry in os.scandir(apps_folder):
        if entry.is_dir() and "ProteoWizard" in entry.name:
            proteowizard_folders.append(entry.path)

    return proteowizard_folders[0]

Returns location of MSConvert directory.

This function uses the MS-DIAL directory path to retrieve user ID, which it then uses to retrieve the path for MSConvert.exe in C:/Users/<username>/AppData/Local/Apps.

TODO: There is probably a better way to implement this.

Returns:

Location of MSConvert directory in C:/Users/<username>/AppData/Local/Apps/ProteoWizard.

def update_msdial_directory(msdial_directory):
def update_msdial_directory(msdial_directory):

    """
    Saves a new MS-DIAL directory location to the "workspace" table of the Settings database.

    Args:
        msdial_directory (str): New MS-DIAL directory location

    Returns:
        None
    """

    # Open the Settings database and load the workspace table
    db_metadata, connection = connect_to_database("Settings")
    workspace_table = sa.Table("workspace", db_metadata, autoload=True)

    # UPDATE the workspace record (id 1), then release the connection
    is_workspace_record = workspace_table.c.id == 1
    update_statement = workspace_table.update().where(is_workspace_record).values(
        msdial_directory=msdial_directory)

    connection.execute(update_statement)
    connection.close()

Updates location of MS-DIAL directory, stored in "workspace" table of the Settings database.

Arguments:
  • msdial_directory (str): New MS-DIAL directory location
Returns:

None

def get_internal_standards_dict(chromatography, value_type):
def get_internal_standards_dict(chromatography, value_type):

    """
    Returns dictionary of internal standard keys mapped to either m/z or RT values.

    This function is used to establish a y-axis range for internal standard retention time plots.
    See load_istd_rt_plot() in the PlotGeneration module.

    TODO: This function needs to filter for polarity!

    Args:
        chromatography (str):
            Chromatography method to retrieve internal standards for
        value_type (str):
            Column of the "internal_standards" table to use as the dictionary values
            (for example "precursor_mz" or "retention_time"). The column is cast to float,
            so non-numeric columns will raise.

    Returns:
        Dictionary with key-value pairs of { internal_standard: value_type }
    """

    engine = sa.create_engine(settings_database)

    # Bind the chromatography name as a parameter instead of splicing it into the SQL text,
    # so names containing quotes cannot break (or alter) the query
    query = sa.text("SELECT * FROM internal_standards WHERE chromatography = :chromatography")
    df_internal_standards = pd.read_sql(query, engine, params={"chromatography": chromatography})

    names = df_internal_standards["name"].astype(str).tolist()
    values = df_internal_standards[value_type].astype(float).tolist()

    # If a name occurs more than once, the last row wins
    return dict(zip(names, values))

Returns dictionary of internal standard keys mapped to either m/z or RT values.

This function is used to establish a y-axis range for internal standard retention time plots. See load_istd_rt_plot() in the PlotGeneration module.

TODO: This function needs to filter for polarity!

Arguments:
  • chromatography (str): Chromatography method to retrieve internal standards for
  • value_type (str): Data type ("precursor_mz", "retention_time", "ms2_spectrum")
Returns:

Dictionary with key-value pairs of { internal_standard: value_type }

def get_internal_standards(chromatography, polarity):
def get_internal_standards(chromatography, polarity):

    """
    Returns DataFrame of internal standards for a given chromatography method and polarity.

    Args:
        chromatography (str):
            Chromatography method ID
        polarity (str):
            Polarity (either "Pos" or "Neg"); any other value is used for filtering as given

    Returns:
        DataFrame of "internal_standards" table from Settings database, filtered by chromatography and polarity.
    """

    # Translate the short polarity codes to the labels used in the query filter
    polarity = {"Pos": "Positive Mode", "Neg": "Negative Mode"}.get(polarity, polarity)

    engine = sa.create_engine(settings_database)

    # Bound parameters keep quotes in user-supplied names from breaking (or altering) the query
    query = sa.text(
        "SELECT * FROM internal_standards "
        "WHERE chromatography = :chromatography AND polarity = :polarity"
    )
    params = {"chromatography": chromatography, "polarity": polarity}

    return pd.read_sql(query, engine, params=params)

Returns DataFrame of internal standards for a given chromatography method and polarity.

Arguments:
  • chromatography (str): Chromatography method ID
  • polarity (str): Polarity (either "Pos" or "Neg")
Returns:

DataFrame of "internal_standards" table from Settings database, filtered by chromatography and polarity.

def get_targeted_features(biological_standard, chromatography, polarity):
def get_targeted_features(biological_standard, chromatography, polarity):

    """
    Returns DataFrame of metabolite targets for a given biological standard, chromatography, and polarity.

    Args:
        biological_standard (str):
            Name of biological standard
        chromatography (str):
            Chromatography method ID (name)
        polarity (str):
            Polarity (either "Pos" or "Neg"); any other value is used for filtering as given

    Returns:
        DataFrame of "targeted_features" table from Settings database, filtered by biological standard,
        chromatography, and polarity.
    """

    # Translate the short polarity codes to the labels used in the query filter
    polarity = {"Pos": "Positive Mode", "Neg": "Negative Mode"}.get(polarity, polarity)

    engine = sa.create_engine(settings_database)

    # Bound parameters keep quotes in user-supplied names from breaking (or altering) the query
    query = sa.text(
        "SELECT * FROM targeted_features "
        "WHERE chromatography = :chromatography "
        "AND polarity = :polarity "
        "AND biological_standard = :biological_standard"
    )
    params = {
        "chromatography": chromatography,
        "polarity": polarity,
        "biological_standard": biological_standard,
    }

    return pd.read_sql(query, engine, params=params)

Returns DataFrame of metabolite targets for a given biological standard, chromatography, and polarity.

Arguments:
  • biological_standard (str): Name of biological standard
  • chromatography (str): Chromatography method ID (name)
  • polarity (str): Polarity (either "Pos" or "Neg")
Returns:

DataFrame of "targeted_features" table from Settings database, filtered by chromatography and polarity.

def get_biological_standards():
def get_biological_standards():

    """
    Reads the whole "biological_standards" table from the Settings database into a DataFrame.
    """

    settings_engine = sa.create_engine(settings_database)
    return pd.read_sql("SELECT * FROM biological_standards", settings_engine)

Returns DataFrame of the "biological_standards" table from the Settings database.

def get_biological_standards_list():
def get_biological_standards_list():

    """
    Returns the distinct biological standard names from the Settings database as a list of strings.
    """

    # Names repeat across rows of the table, so collapse them to unique values
    names = get_biological_standards()["name"].astype(str)
    return names.unique().tolist()

Returns list of biological standards from the Settings database.

def add_biological_standard(name, identifier):
def add_biological_standard(name, identifier):

    """
    Registers a new biological standard under the given name and identifier.

    The identifier is a text substring used to tell biological standards apart from samples:
    MS-AutoQC checks filenames in the sequence for this identifier to process samples accordingly.

    One row is inserted into the "biological_standards" table for every chromatography method,
    each starting with zero positive / negative features and the "Default" MS-DIAL configuration.

    Args:
        name (str):
            Name of biological standard
        identifier (str):
            String identifier in filename for biological standard

    Returns:
        None
    """

    # Every existing chromatography method gets its own row
    method_ids = get_chromatography_methods()["method_id"].tolist()

    # Open the Settings database and reflect the "biological_standards" table
    db_metadata, connection = connect_to_database("Settings")
    table = sa.Table("biological_standards", db_metadata, autoload=True)

    for method_id in method_ids:
        row = {
            "name": name,
            "identifier": identifier,
            "chromatography": method_id,
            "num_pos_features": 0,
            "num_neg_features": 0,
            "msdial_config_id": "Default"
        }
        connection.execute(table.insert().values(row))

    # Release the connection
    connection.close()

Creates new biological standard with name and identifier.

The biological standard identifier is a text substring used to distinguish between sample and biological standard. MS-AutoQC checks filenames in the sequence for this identifier to process samples accordingly.

Arguments:
  • name (str): Name of biological standard
  • identifier (str): String identifier in filename for biological standard
Returns:

None

def remove_biological_standard(name):
def remove_biological_standard(name):

    """
    Deletes biological standard and corresponding MSPs from Settings database.

    Also removes the standard's MSP library files from the methods directory and its rows
    in the "targeted_features" table.

    Args:
        name (str): Name of the biological standard

    Returns:
        None
    """

    # Collect the MSP files registered for this biological standard
    # NOTE(review): assumes the stored values are bare filenames inside the methods directory - confirm
    df = get_table("Settings", "biological_standards")
    df = df.loc[df["name"] == name]
    files_to_delete = set(
        df["pos_bio_msp_file"].astype(str).tolist() + df["neg_bio_msp_file"].astype(str).tolist()
    )

    # Delete only the files that belong to this standard. The membership test must be on the
    # file, not on the standard's name; testing the name either deletes nothing or, if the name
    # happens to match, deletes every file in the directory.
    for file in os.listdir(methods_directory):
        if file in files_to_delete:
            os.remove(os.path.join(methods_directory, file))

    # Connect to database and get relevant tables
    db_metadata, connection = connect_to_database("Settings")
    biological_standards_table = sa.Table("biological_standards", db_metadata, autoload=True)
    targeted_features_table = sa.Table("targeted_features", db_metadata, autoload=True)

    # Remove biological standard
    delete_biological_standard = (
        sa.delete(biological_standards_table)
            .where(biological_standards_table.c.name == name)
    )
    connection.execute(delete_biological_standard)

    # Remove targeted features for that biological standard
    delete_targeted_features = (
        sa.delete(targeted_features_table)
            .where(targeted_features_table.c.biological_standard == name)
    )
    connection.execute(delete_targeted_features)

    # Close the connection
    connection.close()

Deletes biological standard and corresponding MSPs from Settings database.

Arguments:
  • name (str): Name of the biological standard
Returns:

None

def update_msdial_config_for_bio_standard(biological_standard, chromatography, config_id):
def update_msdial_config_for_bio_standard(biological_standard, chromatography, config_id):

    """
    Assigns an MS-DIAL configuration to a biological standard / chromatography method combination.

    Args:
        biological_standard (str):
            Name of the biological standard
        chromatography (str):
            Chromatography method
        config_id (str):
            Name of MS-DIAL configuration to set for this biological standard - chromatography combination

    Returns:
        None
    """

    # Open the Settings database and reflect the "biological_standards" table
    db_metadata, connection = connect_to_database("Settings")
    table = sa.Table("biological_standards", db_metadata, autoload=True)

    # Target only the rows matching both the standard and the chromatography method
    matches_standard = table.c.name == biological_standard
    matches_method = table.c.chromatography == chromatography
    statement = sa.update(table).where(matches_standard & matches_method).values(msdial_config_id=config_id)

    # Run the statement, then release the connection
    connection.execute(statement)
    connection.close()

Updates MS-DIAL configuration for given biological standard and chromatography method combination.

Arguments:
  • biological_standard (str): Name of the biological standard
  • chromatography (str): Chromatography method
  • config_id (str): Name of MS-DIAL configuration to set for this biological standard - chromatography combination
Returns:

None

def get_biological_standard_identifiers(bio_standards=None):
def get_biological_standard_identifiers(bio_standards=None):

    """
    Returns dictionary of identifiers for a given list of biological standards.

    If no list is provided (None), returns dict of identifiers for all biological standards.
    An empty list yields an empty dictionary. Names in the list that do not exist in the
    "biological_standards" table are skipped.

    Args:
        bio_standards (list, default None): List of biological standards

    Returns:
        Dictionary with key-value pairs of { identifier: biological_standard }
    """

    df_bio_standards = get_biological_standards()

    identifiers = {}

    if bio_standards is not None:
        for bio_standard in bio_standards:
            matches = df_bio_standards.loc[df_bio_standards["name"] == bio_standard, "identifier"]

            # Unknown standard: nothing to map, so skip it rather than fail with an IndexError
            if matches.empty:
                continue

            identifiers[matches.astype(str).iloc[0]] = bio_standard
    else:
        # De-duplicate (name, identifier) pairs together. Taking unique() of each column
        # separately can yield lists of different lengths or pair a name with the wrong identifier.
        pairs = df_bio_standards[["name", "identifier"]].astype(str).drop_duplicates()
        for name, identifier in zip(pairs["name"], pairs["identifier"]):
            identifiers[identifier] = name

    return identifiers

Returns dictionary of identifiers for a given list of biological standards.

If no list is provided, returns dict of identifiers for all biological standards.

Arguments:
  • bio_standards (list, default None): List of biological standards
Returns:

Dictionary with key-value pairs of { identifier: biological_standard }

def get_qc_configurations():
def get_qc_configurations():

    """
    Reads the whole "qc_parameters" table from the Settings database into a DataFrame.
    """

    settings_engine = sa.create_engine(settings_database)
    df_qc_parameters = pd.read_sql("SELECT * FROM qc_parameters", settings_engine)
    return df_qc_parameters

Returns DataFrame of "qc_parameters" table from Settings database.

def get_qc_configurations_list():
def get_qc_configurations_list():

    """
    Returns the QC configuration names stored in the Settings database as a list of strings.
    """

    config_names = get_qc_configurations()["config_name"]
    return config_names.astype(str).tolist()

Returns list of names of QC configurations from Settings database.

def add_qc_configuration(qc_config_name):
def add_qc_configuration(qc_config_name):

    """
    Creates a QC configuration in the "qc_parameters" table of the Settings database.

    The new configuration starts with the default cutoffs listed below and all four QC checks enabled.

    Args:
        qc_config_name (str): Name of the QC configuration

    Returns:
        None
    """

    # Starting values for a freshly created configuration
    new_config = {
        "config_name": qc_config_name,
        "intensity_dropouts_cutoff": 4,
        "library_rt_shift_cutoff": 0.1,
        "in_run_rt_shift_cutoff": 0.05,
        "library_mz_shift_cutoff": 0.005,
        "intensity_enabled": True,
        "library_rt_enabled": True,
        "in_run_rt_enabled": True,
        "library_mz_enabled": True
    }

    # Open the Settings database and reflect the "qc_parameters" table
    db_metadata, connection = connect_to_database("Settings")
    qc_parameters = sa.Table("qc_parameters", db_metadata, autoload=True)

    # Insert the row, then release the connection
    connection.execute(qc_parameters.insert().values(new_config))
    connection.close()

Adds a new QC configuration to the "qc_parameters" table in the Settings database.

Arguments:
  • qc_config_name (str): Name of the QC configuration
Returns:

None

def remove_qc_configuration(qc_config_name):
def remove_qc_configuration(qc_config_name):

    """
    Removes a QC configuration from the "qc_parameters" table of the Settings database.

    Args:
        qc_config_name (str): Name of the QC configuration

    Returns:
        None
    """

    # Open the Settings database and reflect the "qc_parameters" table
    db_metadata, connection = connect_to_database("Settings")
    qc_parameters = sa.Table("qc_parameters", db_metadata, autoload=True)

    # Build the DELETE for rows carrying this configuration name
    is_selected_config = qc_parameters.c.config_name == qc_config_name
    statement = sa.delete(qc_parameters).where(is_selected_config)

    # Run the statement, then release the connection
    connection.execute(statement)
    connection.close()

Deletes QC configuration from the "qc_parameters" table in the Settings database.

Arguments:
  • qc_config_name (str): Name of the QC configuration
Returns:

None

def get_qc_configuration_parameters(config_name=None, instrument_id=None, run_id=None):
def get_qc_configuration_parameters(config_name=None, instrument_id=None, run_id=None):

    """
    Returns DataFrame of parameters for a selected QC configuration.

    The DataFrame has columns for each parameter, as well as for whether the parameter is enabled.
    The configuration is selected either directly by name, or indirectly through the
    "qc_config_id" recorded for an instrument run. If config_name is given, it takes precedence.

    Args:
        config_name (str, default None):
            Name of QC configuration
        instrument_id (str, default None):
            Instrument ID (name)
        run_id (str, default None):
            Instrument run ID (job ID)

    Returns:
        DataFrame of parameters for QC configuration.

    Raises:
        ValueError: If neither config_name nor both instrument_id and run_id are provided.
    """

    df_configurations = get_table("Settings", "qc_parameters")

    # Resolve the configuration name from the run when it was not given explicitly
    if config_name is None:
        if instrument_id is None or run_id is None:
            raise ValueError("Provide either config_name, or both instrument_id and run_id")
        df_runs = get_table(instrument_id, "runs")
        config_name = df_runs.loc[df_runs["run_id"] == run_id]["qc_config_id"].values[0]

    # Work on an independent copy so the edits below never touch a view of the full table
    selected_config = df_configurations.loc[df_configurations["config_name"] == config_name].copy()
    selected_config = selected_config.drop(columns=["id", "config_name"])

    # The "enabled" flags are stored as integers (0 / 1); turn them back into booleans.
    # The column is made object-typed first so the booleans are stored as-is rather than
    # depending on how pandas coerces a bool written into an integer column.
    for column in ["intensity_enabled", "library_rt_enabled", "in_run_rt_enabled", "library_mz_enabled"]:
        selected_config[column] = selected_config[column].astype(object)
        selected_config.loc[selected_config[column] == 1, column] = True
        selected_config.loc[selected_config[column] == 0, column] = False

    # Return parameters of selected configuration as a DataFrame
    return selected_config

Returns DataFrame of parameters for a selected QC configuration.

The DataFrame has columns for each parameter, as well as for whether the parameter is enabled.

Arguments:
  • config_name (str, default None): Name of QC configuration
  • instrument_id (str, default None): Instrument ID (name)
  • run_id (str, default None): Instrument run ID (job ID)
Returns:

DataFrame of parameters for QC configuration.

def update_qc_configuration( config_name, intensity_dropouts_cutoff, library_rt_shift_cutoff, in_run_rt_shift_cutoff, library_mz_shift_cutoff, intensity_enabled, library_rt_enabled, in_run_rt_enabled, library_mz_enabled):
def update_qc_configuration(config_name, intensity_dropouts_cutoff, library_rt_shift_cutoff, in_run_rt_shift_cutoff,
    library_mz_shift_cutoff, intensity_enabled, library_rt_enabled, in_run_rt_enabled, library_mz_enabled):

    """
    Saves new parameter values for an existing QC configuration.

    Due to the database schema, booleans are stored as integers: 0 for False and 1 for True. They need to be
    cast back to booleans in get_qc_configuration_parameters(). A schema change would remove the bloat.

    Args:
        config_name (str):
            Name of QC configuration
        intensity_dropouts_cutoff (int):
            Minimum number of internal standard intensity dropouts to constitute a QC fail
        library_rt_shift_cutoff (float):
            Maximum shift from library RT values to constitute a QC fail
        in_run_rt_shift_cutoff (float):
            Maximum shift from in-run RT values to constitute a QC fail
        library_mz_shift_cutoff (float):
            Maximum shift from library m/z values to constitute a QC fail
        intensity_enabled (bool):
            Enables / disables QC check for intensity dropout cutoffs
        library_rt_enabled (bool):
            Enables / disables QC check for library RT shifts
        in_run_rt_enabled (bool):
            Enables / disables QC check for in-run RT shifts
        library_mz_enabled (bool):
            Enables / disables QC check for library m/z shifts

    Returns:
        None
    """

    # Column -> new value, for every editable QC parameter
    new_values = {
        "intensity_dropouts_cutoff": intensity_dropouts_cutoff,
        "library_rt_shift_cutoff": library_rt_shift_cutoff,
        "in_run_rt_shift_cutoff": in_run_rt_shift_cutoff,
        "library_mz_shift_cutoff": library_mz_shift_cutoff,
        "intensity_enabled": intensity_enabled,
        "library_rt_enabled": library_rt_enabled,
        "in_run_rt_enabled": in_run_rt_enabled,
        "library_mz_enabled": library_mz_enabled
    }

    # Open the Settings database and reflect the "qc_parameters" table
    db_metadata, connection = connect_to_database("Settings")
    qc_parameters = sa.Table("qc_parameters", db_metadata, autoload=True)

    # Build the UPDATE for the row(s) carrying this configuration name
    is_selected_config = qc_parameters.c.config_name == config_name
    statement = sa.update(qc_parameters).where(is_selected_config).values(**new_values)

    # Run the statement, then release the connection
    connection.execute(statement)
    connection.close()

Updates parameters for the given QC configuration.

Due to the database schema, booleans are stored as integers: 0 for False and 1 for True. They need to be cast back to booleans in get_qc_configuration_parameters(). A schema change would remove the bloat.

Arguments:
  • config_name (str): Name of QC configuration
  • intensity_dropouts_cutoff (int): Minimum number of internal standard intensity dropouts to constitute a QC fail
  • library_rt_shift_cutoff (float): Maximum shift from library RT values to constitute a QC fail
  • in_run_rt_shift_cutoff (float): Maximum shift from in-run RT values to constitute a QC fail
  • library_mz_shift_cutoff (float): Maximum shift from library m/z values to constitute a QC fail
  • intensity_enabled (bool): Enables / disables QC check for intensity dropout cutoffs
  • library_rt_enabled (bool): Enables / disables QC check for library RT shifts
  • in_run_rt_enabled (bool): Enables / disables QC check for in-run RT shifts
  • library_mz_enabled (bool): Enables / disables QC check for library m/z shifts
Returns:

None

def get_samples_in_run(instrument_id, run_id, sample_type='Both'):
def get_samples_in_run(instrument_id, run_id, sample_type="Both"):

    """
    Returns DataFrame of samples for a given instrument run from instrument database.

    For "Both", rows from the "bio_qc_results" table (minus its "biological_standard" column)
    come first, followed by rows from the "sample_qc_results" table.

    Args:
        instrument_id (str):
            Instrument ID
        run_id (str):
            Instrument run ID (job ID)
        sample_type (str):
            Sample type, either "Sample" or "Biological Standard" or "Both"

    Returns:
        DataFrame of sample tables for a given instrument run.

    Raises:
        ValueError: If sample_type is not one of the three supported values.
    """

    if sample_type == "Sample":
        df = get_table(instrument_id, "sample_qc_results")

    elif sample_type == "Biological Standard":
        df = get_table(instrument_id, "bio_qc_results")

    elif sample_type == "Both":
        df_samples = get_table(instrument_id, "sample_qc_results")
        df_bio_standards = get_table(instrument_id, "bio_qc_results")
        df_bio_standards = df_bio_standards.drop(columns=["biological_standard"])

        # DataFrame.append() was removed in pandas 2.0; pd.concat() gives the same result
        df = pd.concat([df_bio_standards, df_samples], ignore_index=True)

    else:
        raise ValueError('sample_type must be "Sample", "Biological Standard", or "Both"')

    return df.loc[df["run_id"] == run_id]

Returns DataFrame of samples for a given instrument run from instrument database.

Arguments:
  • instrument_id (str): Instrument ID
  • run_id (str): Instrument run ID (job ID)
  • sample_type (str): Sample type, either "Sample" or "Biological Standard" or "Both"
Returns:

DataFrame of sample tables for a given instrument run.

def get_samples_from_csv(instrument_id, run_id, sample_type='Both'):
def get_samples_from_csv(instrument_id, run_id, sample_type="Both"):

    """
    Returns DataFrame of samples in a given run using CSV files from Google Drive.

    CSV files of the run metadata, samples, and biological standards tables are stored
    in the ../data/Instrument_ID_Run_ID/csv directory, and removed on job completion.

    For "Both", rows from bio_standards.csv (minus the "biological_standard" column)
    come first, followed by rows from samples.csv. The "id" column is dropped from the
    result when present.

    Args:
        instrument_id (str):
            Instrument ID
        run_id (str):
            Instrument run ID (job ID)
        sample_type (str):
            Sample type, either "Sample" or "Biological Standard" or "Both"

    Returns:
        DataFrame of samples for a given instrument run.

    Raises:
        ValueError: If sample_type is not one of the three supported values.
    """

    # Folder name is the instrument ID (spaces replaced by underscores) joined to the run ID
    run_folder = instrument_id.replace(" ", "_") + "_" + run_id
    csv_directory = os.path.join(data_directory, run_folder, "csv")

    samples_csv = os.path.join(csv_directory, "samples.csv")
    bio_standards_csv = os.path.join(csv_directory, "bio_standards.csv")

    if sample_type == "Sample":
        df = pd.read_csv(samples_csv, index_col=False)

    elif sample_type == "Biological Standard":
        df = pd.read_csv(bio_standards_csv, index_col=False)

    elif sample_type == "Both":
        df_samples = pd.read_csv(samples_csv, index_col=False)
        df_bio_standards = pd.read_csv(bio_standards_csv, index_col=False)
        df_bio_standards = df_bio_standards.drop(columns=["biological_standard"])

        # DataFrame.append() was removed in pandas 2.0; pd.concat() gives the same result
        df = pd.concat([df_bio_standards, df_samples], ignore_index=True)

    else:
        raise ValueError('sample_type must be "Sample", "Biological Standard", or "Both"')

    df = df.loc[df["run_id"] == run_id]

    # Drop the "id" column only if it exists. A "return" inside "finally" would also
    # silently discard every unrelated exception, so the missing column is tolerated explicitly.
    return df.drop(columns=["id"], errors="ignore")

Returns DataFrame of samples in a given run using CSV files from Google Drive.

CSV files of the run metadata, samples, and biological standards tables are stored in the ../data/Instrument_ID_Run_ID/csv directory, and removed on job completion.

Arguments:
  • instrument_id (str): Instrument ID
  • run_id (str): Instrument run ID (job ID)
  • sample_type (str): Sample type, either "Sample" or "Biological Standard" or "Both"
Returns:

DataFrame of samples for a given instrument run.

def get_next_sample(sample_id, instrument_id, run_id):
def get_next_sample(sample_id, instrument_id, run_id):

    """
    Looks up the sample that follows the given sample in an instrument run.

    Args:
        sample_id (str):
            Sample ID
        instrument_id (str):
            Instrument ID
        run_id (str):
            Instrument run ID (job ID)

    Returns:
        str: The next sample in the instrument run after the given sample ID, or None if last sample.
    """

    # All sample IDs in the run (biological standards and samples), as strings
    df_run_samples = get_samples_in_run(instrument_id, run_id, "Both")
    samples = df_run_samples["sample_id"].astype(str).tolist()

    # Position right after the given sample
    following_index = samples.index(sample_id) + 1

    # The given sample was the last one in the run
    if following_index == len(samples):
        return None

    return samples[following_index]

Returns sample following the given sample, or None if last sample.

Arguments:
  • sample_id (str): Sample ID
  • instrument_id (str): Instrument ID
  • run_id (str): Instrument run ID (job ID)
Returns:

str: The next sample in the instrument run after the given sample ID, or None if last sample.

def get_remaining_samples(instrument_id, run_id):
def get_remaining_samples(instrument_id, run_id):

    """
    Lists the samples still remaining in a given instrument run (QC job).

    The list starts at the run's "latest_sample" (inclusive); if no latest sample
    has been recorded yet, every sample in the run is returned.

    TODO: This function should just return the samples with null values in the "qc_result" column.
        The "latest_sample" value in the "runs" table may be unreliable.

    Args:
        instrument_id (str):
            Instrument ID
        run_id (str):
            Instrument run ID (job ID)

    Returns:
        list: List of samples remaining in a QC job.
    """

    # Last processed sample recorded for the run (the string "None" when nothing is recorded)
    latest_sample = get_instrument_run(instrument_id, run_id)["latest_sample"].astype(str).values[0]

    # All sample IDs in the run (biological standards and samples), as strings
    df_run_samples = get_samples_in_run(instrument_id, run_id, "Both")
    samples = df_run_samples["sample_id"].astype(str).tolist()

    # Beginning of run: everything remains
    if latest_sample == "None":
        return samples

    # Otherwise, everything from the latest sample onwards remains
    start = samples.index(latest_sample)
    return samples[start:]

Returns list of samples remaining in a given instrument run (QC job).

TODO: This function should just return the samples with null values in the "qc_result" column. The "latest_sample" value in the "runs" table may be unreliable.

Arguments:
  • instrument_id (str): Instrument ID
  • run_id (str): Instrument run ID (job ID)
Returns:

list: List of samples remaining in a QC job.

def get_unprocessed_samples(instrument_id, run_id):
def get_unprocessed_samples(instrument_id, run_id):

    """
    For an active run, returns 1) a list of samples that were not processed due to error / runtime termination,
    and 2) the current sample being monitored / processed.

    A sample counts as unprocessed when its "qc_result" is null and a file with the same
    base name (text before the first ".") exists in the run's acquisition path. The last
    such sample is reported as the current sample and excluded from the list.

    Args:
        instrument_id (str):
            Instrument ID
        run_id (str):
            Instrument run ID (job ID)

    Returns:
        tuple: List of unprocessed samples for the given instrument run, and current sample being monitored / processed
        (None if there are no unprocessed samples).
    """

    # Get samples in run that have no QC result yet
    df_samples = get_samples_in_run(instrument_id, run_id, "Both")
    df_unprocessed_samples = df_samples.loc[df_samples["qc_result"].isnull()]
    pending_ids = set(df_unprocessed_samples["sample_id"].astype(str))

    # Base names of files in the acquisition path that correspond to pending samples
    acquisition_path = get_acquisition_path(instrument_id, run_id)
    base_names = (file.split(".")[0] for file in os.listdir(acquisition_path))
    data_files = [base_name for base_name in base_names if base_name in pending_ids]

    # Keep only pending samples whose data file was found. Filtering with a mask avoids
    # writing a marker column into a slice of another DataFrame.
    df_found = df_unprocessed_samples.loc[df_unprocessed_samples["sample_id"].isin(data_files)]
    unprocessed_samples = df_found["sample_id"].astype(str).tolist()

    # The last acquired-but-unprocessed sample is the one currently being monitored
    current_sample = unprocessed_samples.pop() if unprocessed_samples else None

    # Return as tuple
    return unprocessed_samples, current_sample

For an active run, returns 1) a list of samples that were not processed due to error / runtime termination, and 2) the current sample being monitored / processed.

Arguments:
  • instrument_id (str): Instrument ID
  • run_id (str): Instrument run ID (job ID)
Returns:

tuple: List of unprocessed samples for the given instrument run, and current sample being monitored / processed.

def get_current_sample(instrument_id, run_id):
def get_current_sample(instrument_id, run_id):

    """
    Returns the current sample being monitored / processed.

    TODO: The "latest_sample" is the last sample to be processed. Nomenclature needs to be updated in many places.

    NOTE(review): Only the beginning-of-run case is implemented. Once a latest sample has been
    recorded, this function returns None - confirm the intended behavior for that case.

    Args:
        instrument_id (str):
            Instrument ID
        run_id (str):
            Instrument run ID (job ID)

    Returns:
        str: The second sample in the run if no latest sample has been recorded yet
        (None if the run has fewer than two samples); otherwise None.
    """

    # Get latest sample in run
    df_run = get_instrument_run(instrument_id, run_id)
    latest_sample = df_run["latest_sample"].astype(str).values[0]

    # Return second sample if beginning of run
    if latest_sample == "None":
        # The sample list must be loaded here; referencing it without loading it raised a NameError
        samples = get_samples_in_run(instrument_id, run_id, "Both")["sample_id"].astype(str).tolist()
        return samples[1] if len(samples) > 1 else None

    return None

Returns the current sample being monitored / processed.

TODO: The "latest_sample" is the last sample to be processed. Nomenclature needs to be updated in many places.

Arguments:
  • instrument_id (str): Instrument ID
  • run_id (str): Instrument run ID (job ID)
Returns:

str: Current sample being monitored / processed.

def parse_internal_standard_data( instrument_id, run_id, result_type, polarity, load_from, as_json=True):
def parse_internal_standard_data(instrument_id, run_id, result_type, polarity, load_from, as_json=True):

    """
    Parses data from database into JSON-ified DataFrame for samples (as rows) vs. internal standards (as columns).

    Data is stored in a column (for example, "retention_time") as a single-record string dict with the following structure:

    | Sample     | iSTD 1 | iSTD 2 | ... |
    | ---------- | ------ | ------ | ... |
    | SAMPLE_001 | 1.207  | 1.934  | ... |

    These records are concatenated together with this function using pd.DataFrame(), which is 100x faster than pd.concat().

    Args:
        instrument_id (str):
            Instrument ID
        run_id (str):
            Instrument run ID (job ID)
        result_type (str):
            Column in sample_qc_results table to parse (either "retention_time" or "precursor_mz" or "intensity")
        polarity (str):
            Polarity ("Pos" or "Neg")
        load_from (str):
            Where to load sample data from: "database" or "processing" (both read the instrument database),
            or "csv" (CSV file, used during Google Drive sync of active run)
        as_json (bool, default True):
            Whether to return table as JSON string or as DataFrame

    Returns:
        JSON string (records orientation) if as_json is True, otherwise a DataFrame, of samples (rows)
        vs. internal standards (columns). Returns None if load_from is "processing" and the result
        column holds no values for the given polarity.

    Raises:
        ValueError: If load_from is not "database", "processing", or "csv".
    """

    # Get relevant QC results table from the requested source
    if load_from in ("database", "processing"):
        df_samples = get_samples_in_run(instrument_id, run_id, "Sample")
    elif load_from == "csv":
        df_samples = get_samples_from_csv(instrument_id, run_id, "Sample")
    else:
        # Fail with a clear message instead of an UnboundLocalError further down
        raise ValueError(
            'load_from must be "database", "processing", or "csv", not ' + repr(load_from))

    # Filter by polarity
    df_samples = df_samples.loc[df_samples["polarity"] == polarity]
    sample_ids = df_samples["sample_id"].astype(str).tolist()

    # While processing, signal "nothing to parse yet" with None
    if load_from == "processing" and df_samples[result_type].dropna().empty:
        return None

    # Each cell is the string form of a single-record dict; missing cells become empty records
    records = [
        ast.literal_eval(record) if record not in ("None", "nan") else {}
        for record in df_samples[result_type].astype(str).tolist()
    ]

    # Build the table in one shot, then label rows by sample ID instead of the stored "Name" field
    df_results = pd.DataFrame(records)
    df_results.drop(columns=["Name"], inplace=True)
    df_results["Sample"] = sample_ids

    # Return DataFrame as JSON string if requested
    if as_json:
        return df_results.to_json(orient="records")
    return df_results

Parses data from database into JSON-ified DataFrame for samples (as rows) vs. internal standards (as columns).

Data is stored in a column (for example, "retention_time") as a single-record string dict with the following structure:

| Sample     | iSTD 1 | iSTD 2 | ... |
| ---------- | ------ | ------ | ... |
| SAMPLE_001 | 1.207  | 1.934  | ... |

These records are concatenated together with this function using pd.DataFrame(), which is 100x faster than pd.concat().

Arguments:
  • instrument_id (str): Instrument ID
  • run_id (str): Instrument run ID (job ID)
  • result_type (str): Column in sample_qc_results table to parse (either "retention_time" or "precursor_mz" or "intensity")
  • polarity (str): Polarity ("Pos" or "Neg")
  • load_from (str): Specifies whether to load data from CSV file (during Google Drive sync of active run) or instrument database
  • as_json (bool, default True): Whether to return table as JSON string or as DataFrame
Returns:

DataFrame of samples (rows) vs. internal standards (columns) as JSON string.

def parse_biological_standard_data( instrument_id, run_id, result_type, polarity, biological_standard, load_from, as_json=True):
def parse_biological_standard_data(instrument_id, run_id, result_type, polarity, biological_standard, load_from, as_json=True):

    """
    Parses biological standard data into JSON-ified DataFrame of targeted features (as columns) vs. instrument runs (as rows).

    The bio_qc_results table in the instrument database is first filtered by biological standard, chromatography, and polarity.
    Then, the sample name is replaced with the instrument run it was associated with.

    Data is stored in a column (for example, "intensity") as a single-record string dict with the following structure:

    | Name                | Metabolite 1 | Metabolite 2 | ... |
    | ------------------- | ------------ | ------------ | ... |
    | INSTRUMENT_RUN_001  | 13597340     | 53024853     | ... |

    These records are concatenated together with this function using pd.DataFrame(), which is 100x faster than pd.concat().

    | Name                | Metabolite 1 | Metabolite 2 | ... |
    | ------------------- | ------------ | ------------ | ... |
    | INSTRUMENT_RUN_001  | 13597340     | 53024853     | ... |
    | INSTRUMENT_RUN_002  | 23543246     | 102030406    | ... |
    | ...                 | ...          | ...          | ... |

    Args:
        instrument_id (str):
            Instrument ID
        run_id (str):
            Instrument run ID (job ID)
        result_type (str):
            Column in bio_qc_results table to parse (either "retention_time" or "precursor_mz" or "intensity")
        polarity (str):
            Polarity ("Pos" or "Neg")
        biological_standard (str):
            Name of biological standard
        load_from (str):
            Where to load data from: "database" (instrument database) or "csv"
            (CSV file, used during Google Drive sync of active run)
        as_json (bool, default True):
            Whether to return table as JSON string or as DataFrame

    Returns:
        JSON string (records orientation) if as_json is True, otherwise a DataFrame, of targeted
        features for a biological standard (columns) vs. instrument runs (rows).

    Raises:
        ValueError: If load_from is not "database" or "csv".
    """

    # Get relevant QC results table from the requested source
    if load_from == "database":
        df_samples = get_table(instrument_id, "bio_qc_results")
    elif load_from == "csv":
        # The run's CSV folder is named "<instrument ID with underscores>_<run ID>"
        run_folder = instrument_id.replace(" ", "_") + "_" + run_id
        bio_standards_csv = os.path.join(data_directory, run_folder, "csv", "bio_standards.csv")
        df_samples = pd.read_csv(bio_standards_csv, index_col=False)
    else:
        # Fail with a clear message instead of an UnboundLocalError further down
        raise ValueError('load_from must be "database" or "csv", not ' + repr(load_from))

    # Filter by biological standard type
    df_samples = df_samples.loc[df_samples["biological_standard"] == biological_standard]

    # Filter by polarity
    df_samples = df_samples.loc[df_samples["polarity"] == polarity]

    # Look up the chromatography method used by the given run
    df_runs = get_table(instrument_id, "runs")
    chromatography = df_runs.loc[df_runs["run_id"] == run_id]["chromatography"].values[0]

    # Keep only results from runs that used the same chromatography
    matching_run_ids = df_runs.loc[df_runs["chromatography"] == chromatography]["run_id"].astype(str).tolist()
    df_samples = df_samples.loc[df_samples["run_id"].isin(matching_run_ids)]
    run_ids = df_samples["run_id"].astype(str).tolist()

    # Each cell is the string form of a single-record dict; missing cells become empty records
    records = [
        ast.literal_eval(record) if record != "None" and record != "nan" else {}
        for record in df_samples[result_type].fillna('{}').tolist()
    ]

    # Build the table in one shot and label each row with the run it came from
    df_results = pd.DataFrame(records)
    df_results["Name"] = run_ids

    # Return DataFrame as JSON string if requested
    if as_json:
        return df_results.to_json(orient="records")
    return df_results

Parses biological standard data into JSON-ified DataFrame of targeted features (as columns) vs. instrument runs (as rows).

The bio_qc_results table in the instrument database is first filtered by biological standard, chromatography, and polarity. Then, the sample name is replaced with the instrument run it was associated with.

Data is stored in a column (for example, "intensity") as a single-record string dict with the following structure:

| Name                | Metabolite 1 | Metabolite 2 | ... |
| ------------------- | ------------ | ------------ | ... |
| INSTRUMENT_RUN_001  | 13597340     | 53024853     | ... |

These records are concatenated together with this function using pd.DataFrame(), which is 100x faster than pd.concat().

| Name                | Metabolite 1 | Metabolite 2 | ... |
| ------------------- | ------------ | ------------ | ... |
| INSTRUMENT_RUN_001  | 13597340     | 53024853     | ... |
| INSTRUMENT_RUN_002  | 23543246     | 102030406    | ... |
| ...                 | ...          | ...          | ... |

Arguments:
  • instrument_id (str): Instrument ID
  • run_id (str): Instrument run ID (job ID)
  • result_type (str): Column in bio_qc_results table to parse (either "retention_time" or "precursor_mz" or "intensity")
  • polarity (str): Polarity ("Pos" or "Neg")
  • biological_standard (str): Name of biological standard
  • load_from (str): Specifies whether to load data from CSV file (during Google Drive sync of active run) or instrument database
  • as_json (bool, default True): Whether to return table as JSON string or as DataFrame
Returns:

JSON-ified DataFrame of targeted features for a biological standard (columns) vs. instrument runs (rows).

def parse_internal_standard_qc_data( instrument_id, run_id, polarity, result_type, load_from, as_json=True):
def parse_internal_standard_qc_data(instrument_id, run_id, polarity, result_type, load_from, as_json=True):

    """
    Parses QC data into JSON-ified DataFrame for samples (as rows) vs. internal standards (as columns).

    The QC DataFrame is stored in the "qc_dataframe" column as the string form of a list of
    single-record dicts, one per QC result type, in this order:
    "Delta m/z", "Delta RT", "In-run delta RT", "Intensity dropout", "Warnings", "Fails".

    The records for the requested result type are concatenated together with this function using
    pd.DataFrame(), which is 100x faster than pd.concat().

    Args:
        instrument_id (str):
            Instrument ID
        run_id (str):
            Instrument run ID (job ID)
        polarity (str):
            Polarity ("Pos" or "Neg")
        result_type (str):
            QC result to parse: "Delta m/z", "Delta RT", "In-run delta RT", "Intensity dropout",
            "Warnings", or "Fails"
        load_from (str):
            Where to load sample data from: "database" or "processing" (both read the instrument database),
            or "csv" (CSV file, used during Google Drive sync of active run)
        as_json (bool, default True):
            Whether to return table as JSON string or as DataFrame

    Returns:
        JSON string (records orientation) if as_json is True, otherwise a DataFrame, of QC data
        for samples (as rows) vs. internal standards (as columns).

    Raises:
        ValueError: If load_from is not "database", "processing", or "csv".
        KeyError: If result_type is not one of the supported QC result types.
    """

    # Get relevant QC results table from the requested source
    if load_from in ("database", "processing"):
        df_samples = get_samples_in_run(instrument_id, run_id, "Sample")
    elif load_from == "csv":
        df_samples = get_samples_from_csv(instrument_id, run_id, "Sample")
    else:
        # Fail with a clear message instead of an UnboundLocalError further down
        raise ValueError(
            'load_from must be "database", "processing", or "csv", not ' + repr(load_from))

    # Filter by polarity
    df_samples = df_samples.loc[df_samples["polarity"] == polarity]

    # Position of each result type within the stored "qc_dataframe" list
    get_result_index = {
        "Delta m/z": 0,
        "Delta RT": 1,
        "In-run delta RT": 2,
        "Intensity dropout": 3,
        "Warnings": 4,
        "Fails": 5
    }
    type_index = get_result_index[result_type]

    # Samples without QC data get a placeholder list of six empty records
    sample_ids = df_samples["sample_id"].astype(str).tolist()
    stored_lists = df_samples["qc_dataframe"].fillna('[{}, {}, {}, {}, {}, {}]').astype(str).tolist()

    # Pick out the record for the requested result type from each sample's list
    records = [ast.literal_eval(stored)[type_index] for stored in stored_lists]

    # Build the table in one shot, then label rows by sample ID instead of the stored "Name" field
    df_results = pd.DataFrame(records)
    df_results.drop(columns=["Name"], inplace=True)
    df_results["Sample"] = sample_ids

    # Return DataFrame as JSON string if requested
    if as_json:
        return df_results.to_json(orient="records")
    return df_results

Parses QC data into JSON-ified DataFrame for samples (as rows) vs. internal standards (as columns).

The QC DataFrame is stored in the "qc_dataframe" column as a single-record string dict with the following structure:

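| Sample     | Delta m/z | Delta RT | In-run delta RT | Warnings | Fails |
| ---------- | --------- | -------- | --------------- | -------- | ----- |
| SAMPLE_001 | 0.000001  | 0.001    | 0.00001         | None     | None  |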

These records are concatenated together with this function using pd.DataFrame(), which is 100x faster than pd.concat().

Arguments:
  • instrument_id (str): Instrument ID
  • run_id (str): Instrument run ID (job ID)
  • polarity (str): Polarity ("Pos" or "Neg")
  • result_type (str): QC result to parse (one of "Delta m/z", "Delta RT", "In-run delta RT", "Intensity dropout", "Warnings", or "Fails")
  • load_from (str): Specifies whether to load data from CSV file (during Google Drive sync of active run) or instrument database
  • as_json (bool, default True): Whether to return table as JSON string or as DataFrame
Returns:

JSON-ified DataFrame of QC data for samples (as rows) vs. internal standards (as columns).

def get_workspace_users_list():
def get_workspace_users_list():

    """
    Fetches the email addresses of users that have access to the MS-AutoQC workspace.

    Returns:
        list: Values of the "email_address" column in the "gdrive_users" table of the
        Settings database, as strings.
    """

    df_gdrive_users = get_table("Settings", "gdrive_users")
    email_addresses = df_gdrive_users["email_address"].astype(str)
    return email_addresses.tolist()

Returns a list of users that have access to the MS-AutoQC workspace.

def add_user_to_workspace(email_address):
def add_user_to_workspace(email_address):

    """
    Gives user access to workspace in Google Drive and stores email address in database.

    Access is granted by sharing the MS-AutoQC folder in Google Drive with the user's Google account.

    Args:
        email_address (str): Email address for Google account to grant access to workspace.

    Returns:
        None on success. Returns the string "User already exists" if the email address is already
        in the workspace users list, or "Error" if no Google Drive folder ID is available.
    """

    if email_address in get_workspace_users_list():
        return "User already exists"

    # Get Google Drive instance
    drive = get_drive_instance()

    # Get ID of MS-AutoQC folder in Google Drive
    gdrive_folder_id = get_drive_folder_id()

    if gdrive_folder_id is None:
        return "Error"

    # Add user access by sharing the folder with write permission
    folder = drive.CreateFile({"id": gdrive_folder_id})
    permission = folder.InsertPermission({
        "type": "user",
        "role": "writer",
        "value": email_address})

    # Insert user email address in "gdrive_users" table
    db_metadata, connection = connect_to_database("Settings")

    try:
        gdrive_users_table = sa.Table("gdrive_users", db_metadata, autoload=True)

        insert_user_email = gdrive_users_table.insert().values(
            {"name": permission["name"],
             "email_address": email_address,
             "permission_id": permission["id"]})

        connection.execute(insert_user_email)
    finally:
        # Release the connection even if the insert fails
        connection.close()

Gives user access to workspace in Google Drive and stores email address in database.

Access is granted by sharing the MS-AutoQC folder in Google Drive with the user's Google account.

Arguments:
  • email_address (str): Email address for Google account to grant access to workspace.
Returns:

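None on success; the string "User already exists" if the email address is already in the workspace users list, or "Error" if no Google Drive folder ID is available.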

def delete_user_from_workspace(email_address):
def delete_user_from_workspace(email_address):

    """
    Removes user access to workspace in Google Drive and deletes email from database.

    Args:
        email_address (str): Email address for Google account whose access will be revoked.

    Returns:
        None on success. Returns the string "User does not exist" if the email address is not
        in the workspace users list, or "Error" if no Google Drive folder ID is available.
    """

    # Read the users table once; it is needed for both the existence check and the permission ID
    df_gdrive_users = get_table("Settings", "gdrive_users")

    if email_address not in df_gdrive_users["email_address"].astype(str).tolist():
        return "User does not exist"

    # Get Google Drive instance
    drive = get_drive_instance()

    # Get ID of MS-AutoQC folder in Google Drive
    gdrive_folder_id = get_drive_folder_id()

    if gdrive_folder_id is None:
        return "Error"

    # Get permission ID of user from database
    folder = drive.CreateFile({"id": gdrive_folder_id})
    df_user = df_gdrive_users.loc[df_gdrive_users["email_address"] == email_address]
    permission_id = df_user["permission_id"].astype(str).values[0]

    # Delete user access by updating permissions
    folder.DeletePermission(permission_id)

    # Delete user email address in "gdrive_users" table
    db_metadata, connection = connect_to_database("Settings")

    try:
        gdrive_users_table = sa.Table("gdrive_users", db_metadata, autoload=True)

        delete_user_email = (
            sa.delete(gdrive_users_table)
                .where((gdrive_users_table.c.email_address == email_address))
        )

        connection.execute(delete_user_email)
    finally:
        # Release the connection even if the delete fails
        connection.close()

Removes user access to workspace in Google Drive and deletes email from database.

Arguments:
  • email_address (str): Email address for Google account whose access will be revoked.
Returns:

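None on success; the string "User does not exist" if the email address is not in the workspace users list, or "Error" if no Google Drive folder ID is available.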

def get_qc_results(instrument_id, sample_list, is_bio_standard=False):
def get_qc_results(instrument_id, sample_list, is_bio_standard=False):

    """
    Returns DataFrame of QC results for a given sample list.

    TODO: This function will break if samples in different runs have the same sample ID. Add run ID filter.

    Args:
        instrument_id (str):
            Instrument ID
        sample_list (list):
            List of samples to query
        is_bio_standard (bool, default False):
            Whether the list is biological standards (True) or samples (False)

    Returns:
        DataFrame with "sample_id" and "qc_result" columns for the given sample list,
        or an empty DataFrame if the sample list is empty.
    """

    if len(sample_list) == 0:
        return pd.DataFrame()

    database = get_database_file(instrument_id=instrument_id, sqlite_conn=True)
    engine = sa.create_engine(database)

    # Table name comes from a fixed pair of options, never from caller-supplied text
    table_name = "bio_qc_results" if is_bio_standard else "sample_qc_results"

    # Bind sample IDs as parameters rather than pasting them into the SQL string, so IDs
    # containing quotes or other special characters cannot break or alter the query
    query = sa.text(
        "SELECT sample_id, qc_result FROM " + table_name + " WHERE sample_id IN :sample_ids"
    ).bindparams(sa.bindparam("sample_ids", expanding=True))

    return pd.read_sql(query, engine, params={"sample_ids": list(sample_list)})

Returns DataFrame of QC results for a given sample list.

TODO: This function will break if samples in different runs have the same sample ID. Add run ID filter.

Arguments:
  • instrument_id (str): Instrument ID
  • sample_list (list): List of samples to query
  • is_bio_standard (bool, default False): Whether the list is biological standards (True) or samples (False)
Returns:

DataFrame of QC results for a given sample list.

def create_workspace_metadata():
def create_workspace_metadata():

    """
    Creates record in "workspace" table to store various metadata.

    Inserts a single row with id 1 into the "workspace" table of the Settings database.

    Returns:
        None
    """

    db_metadata, connection = connect_to_database("Settings")

    try:
        workspace_table = sa.Table("workspace", db_metadata, autoload=True)
        connection.execute(workspace_table.insert().values({"id": 1}))
    finally:
        # Release the connection even if the insert fails
        connection.close()

Creates record in "workspace" table to store various metadata.

def get_device_identity():
def get_device_identity():

    """
    Looks up the identity of this device (either an Instrument ID or "Shared user").

    Returns:
        str: First value of the "instrument_identity" column in the "workspace" table
        of the Settings database.
    """

    df_workspace = get_table("Settings", "workspace")
    identities = df_workspace["instrument_identity"].astype(str).tolist()
    return identities[0]

Returns device identity (either an Instrument ID or "Shared user").

def set_device_identity(is_instrument_computer, instrument_id):
def set_device_identity(is_instrument_computer, instrument_id):

    """
    Indicates whether the user's device is the instrument PC or not.

    Args:
        is_instrument_computer (bool):
            Whether the device is an instrument computer or not
        instrument_id (str):
            Instrument ID (ignored and stored as "Shared user" if is_instrument_computer is False)

    Returns:
        None
    """

    # Devices that are not instrument computers all share one identity
    if not is_instrument_computer:
        instrument_id = "Shared user"

    db_metadata, connection = connect_to_database("Settings")

    try:
        workspace_table = sa.Table("workspace", db_metadata, autoload=True)

        # The identity is written to the row with id 1
        update_identity = (
            sa.update(workspace_table)
                .where(workspace_table.c.id == 1)
                .values(
                    is_instrument_computer=is_instrument_computer,
                    instrument_identity=instrument_id
            )
        )

        connection.execute(update_identity)
    finally:
        # Release the connection even if the update fails
        connection.close()

Indicates whether the user's device is the instrument PC or not.

Arguments:
  • is_instrument_computer (bool): Whether the device is an instrument computer or not
  • instrument_id (str): Instrument ID (stored as "Shared user" if is_instrument_computer is False)
Returns:

None

def run_is_on_instrument_pc(instrument_id, run_id):
def run_is_on_instrument_pc(instrument_id, run_id):

    """
    Checks whether the current device is the instrument PC on which the run was started.

    TODO: Use this function in PlotGeneration and DashWebApp module.

    Args:
        instrument_id (str):
            Instrument ID
        run_id (str):
            Instrument run ID

    Returns:
        True if the instrument ID recorded for the run equals this device's identity, and False if not.
    """

    # Instrument recorded for the run
    df_run = get_instrument_run(instrument_id, run_id)
    run_instrument = df_run["instrument_id"].astype(str).tolist()[0]

    # Identity stored for this device in the Settings database
    df_workspace = get_table("Settings", "workspace")
    device_identity = df_workspace["instrument_identity"].astype(str).tolist()[0]

    return run_instrument == device_identity

Validates that the current device is the instrument PC on which the run was started.

TODO: Use this function in PlotGeneration and DashWebApp module.

Arguments:
  • instrument_id (str): Instrument ID
  • run_id (str): Instrument run ID
Returns:

True if instrument run was started on the current device, and False if not.

def update_slack_bot_token(slack_bot_token):
def update_slack_bot_token(slack_bot_token):

    """
    Updates Slack bot user OAuth 2.0 token in "workspace" table of Settings database.

    For details on the Slack API, see: https://slack.dev/python-slack-sdk/

    Args:
        slack_bot_token (str): Slack bot user OAuth token

    Returns:
        None
    """

    db_metadata, connection = connect_to_database("Settings")

    try:
        workspace_table = sa.Table("workspace", db_metadata, autoload=True)

        # Named so as not to shadow this function's own name
        update_token = (
            sa.update(workspace_table)
                .where(workspace_table.c.id == 1)
                .values(slack_bot_token=slack_bot_token)
        )

        connection.execute(update_token)
    finally:
        # Release the connection even if the update fails
        connection.close()

Updates Slack bot user OAuth 2.0 token in "workspace" table of Settings database.

For details on the Slack API, see: https://slack.dev/python-slack-sdk/

Arguments:
  • slack_bot_token (str): Slack bot user OAuth token
Returns:

None

def get_slack_bot_token():
def get_slack_bot_token():

    """
    Looks up the Slack bot token stored in "workspace" table of Settings database.

    Returns:
        str: First value of the "slack_bot_token" column, cast to string.
    """

    df_workspace = get_table("Settings", "workspace")
    tokens = df_workspace["slack_bot_token"].astype(str)
    return tokens.values[0]

Returns Slack bot token stored in "workspace" table of Settings database.

def update_slack_channel(slack_channel, notifications_enabled):
def update_slack_channel(slack_channel, notifications_enabled):

    """
    Updates Slack channel registered for notifications in "workspace" table of Settings database.

    Args:
        slack_channel (str):
            Slack channel to post messages to (any "#" characters are removed before storing)
        notifications_enabled (bool):
            Whether to send Slack notifications for QC warnings and fails

    Returns:
        None
    """

    db_metadata, connection = connect_to_database("Settings")

    try:
        workspace_table = sa.Table("workspace", db_metadata, autoload=True)

        # Named so as not to shadow this function's own name
        update_channel = (
            sa.update(workspace_table)
                .where(workspace_table.c.id == 1)
                .values(
                    slack_channel=slack_channel.replace("#", ""),
                    slack_enabled=notifications_enabled)
        )

        connection.execute(update_channel)
    finally:
        # Release the connection even if the update fails
        connection.close()

Updates Slack channel registered for notifications in "workspace" table of Settings database.

Arguments:
  • slack_channel (str): Slack channel to post messages to
  • notifications_enabled (bool): Whether to send Slack notifications for QC warnings and fails
Returns:

None

def get_slack_channel():
def get_slack_channel():

    """
    Looks up the Slack channel registered for notifications.

    Returns:
        str: First value of the "slack_channel" column in the "workspace" table
        of the Settings database, cast to string.
    """

    df_workspace = get_table("Settings", "workspace")
    channels = df_workspace["slack_channel"].astype(str)
    return channels.values[0]

Returns Slack channel registered for notifications.

def get_slack_notifications_toggled():
def get_slack_notifications_toggled():

    """
    Returns Slack notification toggle setting.

    Returns:
        int: First value of the "slack_enabled" column in the "workspace" table of the
        Settings database, or None if the value cannot be read or converted to an integer.
    """

    try:
        return get_table("Settings", "workspace")["slack_enabled"].astype(int).tolist()[0]
    except Exception:
        # Best-effort lookup: an unset or unreadable setting is reported as None.
        # Unlike a bare "except", this does not swallow KeyboardInterrupt / SystemExit.
        return None

Returns Slack notification toggle setting.

def get_email_notifications_list(as_string=False):
def get_email_notifications_list(as_string=False):

    """
    Returns list of emails registered for email notifications for QC warnings and fails.

    Args:
        as_string (bool, default False):
            Whether to return the list as a string (for Gmail API) or as list object (for display in Settings page)

    Returns:
        List of emails registered for QC warning/fail notifications, or a single
        comma-separated string of those emails if as_string is True.
    """

    email_list = get_table("Settings", "email_notifications")["email_address"].astype(str).tolist()

    if as_string:
        # join() separates every address with a comma; comparing each value against the last
        # element instead would drop commas whenever an earlier address equals the last one
        return ",".join(email_list)

    return email_list

Returns list of emails registered for email notifications for QC warnings and fails.

Arguments:
  • as_string (bool, default False): Whether to return the list as a string (for Gmail API) or as list object (for display in Settings page)
Returns:

List of emails registered for QC warning/fail notifications.

def register_email_for_notifications(email_address):
def register_email_for_notifications(email_address):

    """
    Inserts email address into "email_notifications" table in Settings database.

    Args:
        email_address (str): Email address to register for notifications.

    Returns:
        None
    """

    db_metadata, connection = connect_to_database("Settings")

    try:
        email_notifications_table = sa.Table("email_notifications", db_metadata, autoload=True)

        insert_email_address = email_notifications_table.insert().values({
            "email_address": email_address
        })

        connection.execute(insert_email_address)
    finally:
        # Release the connection even if the insert fails
        connection.close()

Inserts email address into "email_notifications" table in Settings database.

Arguments:
  • email_address (str): Email address to register for notifications.
Returns:

None

def delete_email_from_notifications(email_address):
def delete_email_from_notifications(email_address):

    """
    Deletes email address from "email_notifications" table in Settings database.

    Args:
        email_address (str): Email address to unsubscribe from notifications.

    Returns:
        None
    """

    db_metadata, connection = connect_to_database("Settings")

    try:
        email_notifications_table = sa.Table("email_notifications", db_metadata, autoload=True)

        delete_email_address = (
            sa.delete(email_notifications_table)
                .where((email_notifications_table.c.email_address == email_address))
        )

        connection.execute(delete_email_address)
    finally:
        # Release the connection even if the delete fails
        connection.close()

Deletes email address from "email_notifications" table in Settings database.

Arguments:
  • email_address (str): Email address to unsubscribe from notifications.
Returns:

None

def get_completed_samples_count(instrument_id, run_id, status):
def get_completed_samples_count(instrument_id, run_id, status):

    """
    Looks up how many samples are completed, and how many there are in total, for an instrument run.

    The run record is read from CSV only when the run is active, sync is enabled, and this
    device's identity differs from the given instrument ID; otherwise it is read via
    get_instrument_run().

    Args:
        instrument_id (str):
            Instrument ID
        run_id (str):
            Instrument run ID (job ID)
        status (str):
            Instrument run (QC job) status, either "Active" or "Complete"

    Returns:
        Tuple with number of completed samples and total samples for a given instrument run.
    """

    # Conditions are checked left to right, so later lookups only happen when needed
    read_from_csv = (
        status == "Active"
        and sync_is_enabled()
        and get_device_identity() != instrument_id
    )

    if read_from_csv:
        df_run = get_instrument_run_from_csv(instrument_id, run_id)
    else:
        df_run = get_instrument_run(instrument_id, run_id)

    completed_count = df_run["completed"].astype(int).tolist()[0]
    sample_count = df_run["samples"].astype(int).tolist()[0]
    return (completed_count, sample_count)

Returns tuple containing count for completed samples and total samples in a given instrument run.

Arguments:
  • instrument_id (str): Instrument ID
  • run_id (str): Instrument run ID (job ID)
  • status (str): Instrument run (QC job) status, either "Active" or "Complete"
Returns:

Tuple with number of completed samples and total samples for a given instrument run.

def get_run_progress(instrument_id, run_id, status):
def get_run_progress(instrument_id, run_id, status):

    """
    Returns progress of instrument run as a percentage of samples completed.

    Args:
        instrument_id (str):
            Instrument ID
        run_id (str):
            Instrument run ID (job ID)
        status (str):
            Instrument run (QC job) status, either "Active" or "Complete"

    Returns:
        float: Percent of samples processed for the given instrument run, rounded to one
        decimal place. Returns 0.0 if the run has no samples.
    """

    completed, total_samples = get_completed_samples_count(instrument_id, run_id, status)

    # A run with no samples has no progress to report; avoid dividing by zero
    if total_samples == 0:
        return 0.0

    percent_complete = (completed / total_samples) * 100
    return round(percent_complete, 1)

Returns progress of instrument run as a percentage of samples completed.

Arguments:
  • instrument_id (str): Instrument ID
  • run_id (str): Instrument run ID (job ID)
  • status (str): Instrument run (QC job) status, either "Active" or "Complete"
Returns:

float: Percent of samples processed for the given instrument run.

def update_sample_counters_for_run(instrument_id, run_id, latest_sample):
def update_sample_counters_for_run(instrument_id, run_id, latest_sample):

    """
    Recomputes the "completed", "passes" and "fails" counts for an instrument run from its QC results,
    and stores them in the "runs" table along with the latest sample.

    TODO: The "latest_sample" is the last sample to be processed / completed.
        Nomenclature should be updated for clarity.

    Args:
        instrument_id (str):
            Instrument ID
        run_id (str):
            Instrument run ID (job ID)
        latest_sample (str):
            Last sample to be processed

    Returns:
        None
    """

    df = get_samples_in_run(instrument_id, run_id, "Both")

    # Tally QC results once; if the results cannot be read, treat every count as zero
    try:
        result_counts = df["qc_result"].value_counts()
    except (KeyError, TypeError, AttributeError):
        result_counts = {}

    passes = int(result_counts.get("Pass", 0))
    fails = int(result_counts.get("Fail", 0))

    # NOTE(review): samples with a "Warning" QC result are not counted as completed here,
    # which matches the previous behavior; confirm whether that is intended.
    completed = passes + fails

    db_metadata, connection = connect_to_database(instrument_id)

    # Close the connection even if the update fails
    try:
        instrument_runs_table = sa.Table("runs", db_metadata, autoload=True)

        update_status = (
            sa.update(instrument_runs_table)
                .where(instrument_runs_table.c.run_id == run_id)
                .values(
                    completed=completed,
                    passes=passes,
                    fails=fails,
                    latest_sample=latest_sample
            )
        )

        connection.execute(update_status)
    finally:
        connection.close()

Increments "completed" count, as well as "pass" and "fail" counts accordingly.

TODO: The "latest_sample" is the last sample to be processed / completed. Nomenclature should be updated for clarity.

Arguments:
  • instrument_id (str): Instrument ID
  • run_id (str): Instrument run ID (job ID)
  • latest_sample (str): Last sample to be processed
Returns:

None

def mark_run_as_completed(instrument_id, run_id):
def mark_run_as_completed(instrument_id, run_id):

    """
    Marks instrument run status as completed by setting "status" to "Complete" in the "runs" table.

    Args:
        instrument_id (str):
            Instrument ID
        run_id (str):
            Instrument run ID (job ID)

    Returns:
        None
    """

    db_metadata, connection = connect_to_database(instrument_id)

    # Close the connection even if the update fails
    try:
        instrument_runs_table = sa.Table("runs", db_metadata, autoload=True)

        update_status = (
            sa.update(instrument_runs_table)
                .where(instrument_runs_table.c.run_id == run_id)
                .values(status="Complete")
        )

        connection.execute(update_status)
    finally:
        connection.close()

Marks instrument run status as completed.

Arguments:
  • instrument_id (str): Instrument ID
  • run_id (str): Instrument run ID (job ID)
Returns:

None

def skip_sample(instrument_id, run_id):
def skip_sample(instrument_id, run_id):

    """
    Skips sample by setting "latest_sample" value for instrument run to the next sample.

    This function was used after restarting the acquisition listener when MS-DIAL got stuck processing a corrupted file.
    Now that MS-DIAL runs in the background, it is deprecated and should be removed.

    Args:
        instrument_id (str):
            Instrument ID
        run_id (str):
            Instrument run ID (job ID)

    Returns:
        None

    Raises:
        IndexError: If fewer than two samples remain in the run (assuming get_remaining_samples()
        returns a list; TODO confirm).
    """

    # Get next sample (second entry of the remaining samples)
    samples = get_remaining_samples(instrument_id, run_id)
    next_sample = samples[1]

    # Set latest sample to next sample
    db_metadata, connection = connect_to_database(instrument_id)

    # Close the connection even if the update fails
    try:
        instrument_runs_table = sa.Table("runs", db_metadata, autoload=True)

        connection.execute((
            sa.update(instrument_runs_table)
                .where(instrument_runs_table.c.run_id == run_id)
                .values(latest_sample=next_sample)
        ))
    finally:
        connection.close()

Skips sample by setting "latest_sample" value for instrument run to the next sample.

This function was used after restarting the acquisition listener when MS-DIAL got stuck processing a corrupted file. Now that MS-DIAL runs in the background, it is deprecated and should be removed.

Arguments:
  • instrument_id (str): Instrument ID
  • run_id (str): Instrument run ID (job ID)
Returns:

None

def store_pid(instrument_id, run_id, pid):
def store_pid(instrument_id, run_id, pid):

    """
    Stores acquisition listener subprocess ID to allow for checkup and termination.

    The value is written to the "pid" column of the run's row in the "runs" table.

    Args:
        instrument_id (str):
            Instrument ID
        run_id (str):
            Instrument run ID (job ID)
        pid (str):
            Process ID for acquisition listener subprocess

    Returns:
        None
    """

    db_metadata, connection = connect_to_database(instrument_id)

    # Close the connection even if the update fails
    try:
        instrument_runs_table = sa.Table("runs", db_metadata, autoload=True)

        update_pid = (
            sa.update(instrument_runs_table)
                .where(instrument_runs_table.c.run_id == run_id)
                .values(pid=pid)
        )

        connection.execute(update_pid)
    finally:
        connection.close()

Stores acquisition listener subprocess ID to allow for checkup and termination.

Arguments:
  • instrument_id (str): Instrument ID
  • run_id (str): Instrument run ID (job ID)
  • pid (str): Process ID for acquisition listener subprocess
Returns:

None

def get_pid(instrument_id, run_id):
def get_pid(instrument_id, run_id):

    """
    Retrieves acquisition listener process ID from the instrument run record.

    Args:
        instrument_id (str):
            Instrument ID
        run_id (str):
            Instrument run ID (job ID)

    Returns:
        int: Process ID of the acquisition listener subprocess, or None if it could not be retrieved
        (for example, if the run does not exist or no PID has been stored).
    """

    # Best-effort lookup: any failure to read or convert the PID yields None.
    # Exception (rather than a bare except) lets KeyboardInterrupt / SystemExit propagate.
    try:
        return get_instrument_run(instrument_id, run_id)["pid"].astype(int).tolist()[0]
    except Exception:
        return None

Retrieves acquisition listener process ID from "runs" table in Settings database.

Arguments:
  • instrument_id (str): Instrument ID
  • run_id (str): Instrument run ID (job ID)
Returns:

int: Process ID of the acquisition listener subprocess, or None if it could not be retrieved.

def upload_to_google_drive(file_dict):
def upload_to_google_drive(file_dict):

    """
    Uploads local files into the MS-AutoQC folder in Google Drive.

    Entries whose file path does not exist on disk are skipped. If no usable folder ID
    is configured (None, "None", or an empty string), nothing is uploaded.

    Args:
        file_dict (dict):
            Dictionary with key-value structure { filename : file path }

    Returns:
        dict: Dictionary with key-value structure { filename : Google Drive ID }
    """

    # Google Drive instance and ID of the MS-AutoQC folder
    drive = get_drive_instance()
    folder_id = get_drive_folder_id()

    # Drive ID's of uploaded file(s), keyed by title
    uploaded = {}

    # Bail out early when the folder ID is missing or a placeholder
    if folder_id is None:
        return uploaded
    if folder_id == "None" or folder_id == "":
        return uploaded

    for filename, file_path in file_dict.items():

        # Only upload files that actually exist locally
        if not os.path.exists(file_path):
            continue

        drive_file = drive.CreateFile(metadata={
            "title": filename,
            "parents": [{"id": folder_id}],
        })
        drive_file.SetContentFile(file_path)
        drive_file.Upload()

        uploaded[drive_file["title"]] = drive_file["id"]

    return uploaded

Uploads files to MS-AutoQC folder in Google Drive.

Arguments:
  • file_dict (dict): Dictionary with key-value structure { filename : file path }
Returns:

dict: Dictionary with key-value structure { filename : Google Drive ID }

def upload_qc_results(instrument_id, run_id):
def upload_qc_results(instrument_id, run_id):

    """
    Uploads QC results for a given instrument run to Google Drive as a ZIP archive of CSV files.

    Writes run.csv, and (when non-empty) samples.csv and bio_standards.csv, to
    /data/<instrument>_<run_id>/csv, zips that directory, and then either updates the existing
    archive in Google Drive or uploads it for the first time and stores its Drive ID in the "runs" table.

    Args:
        instrument_id (str):
            Instrument ID
        run_id (str):
            Instrument run ID (job ID)

    Returns:
        None
    """

    run_name = instrument_id.replace(" ", "_") + "_" + run_id

    # Get Google Drive instance
    drive = get_drive_instance()

    # Create /data/<run_name>/csv (and any missing parent directories)
    run_directory = os.path.join(data_directory, run_name)
    csv_directory = os.path.join(run_directory, "csv")
    os.makedirs(csv_directory, exist_ok=True)

    # Define file paths
    run_csv_path = os.path.join(csv_directory, "run.csv")
    samples_csv_path = os.path.join(csv_directory, "samples.csv")
    bio_standards_csv_path = os.path.join(csv_directory, "bio_standards.csv")

    # Convert run, sample, and biological standard QC results from database into CSV files
    df_run = get_instrument_run(instrument_id, run_id)
    df_run.to_csv(run_csv_path, index=False)

    df_samples = get_samples_in_run(instrument_id=instrument_id, run_id=run_id, sample_type="Sample")
    if len(df_samples) > 0:
        df_samples.to_csv(samples_csv_path, index=False)

    df_bio_standards = get_table(instrument_id, "bio_qc_results")
    if len(df_bio_standards) > 0:
        df_bio_standards.to_csv(bio_standards_csv_path, index=False)

    # Compress CSV files into a ZIP archive for faster upload
    zip_filename = run_name + ".zip"
    zip_file_path = zip_csv_files(
        input_directory=csv_directory, output_directory_and_name=os.path.join(run_directory, run_name))

    # Get Google Drive ID for the CSV files ZIP archive (already loaded with the run record)
    zip_file_drive_id = df_run["drive_id"].tolist()[0]

    # Update existing ZIP archive in Google Drive
    if zip_file_drive_id is not None:

        drive_file = drive.CreateFile({
            "id": zip_file_drive_id,
            "title": zip_filename,
        })

        # Execute upload
        drive_file.SetContentFile(zip_file_path)
        drive_file.Upload()
        return None

    # Upload CSV files ZIP archive to Google Drive for first time
    drive_id = upload_to_google_drive({zip_filename: zip_file_path}).get(zip_filename)

    # Nothing was uploaded (e.g. no Google Drive folder configured), so there is no ID to store
    if drive_id is None:
        print("upload_qc_results() - ZIP archive was not uploaded; no Drive ID to store.")
        return None

    # Store Drive ID of ZIP file in local database
    db_metadata, connection = connect_to_database(instrument_id)

    # Close the connection even if the update fails
    try:
        runs_table = sa.Table("runs", db_metadata, autoload=True)

        connection.execute((
            sa.update(runs_table)
                .where(runs_table.c.run_id == run_id)
                .values(drive_id=drive_id)
        ))
    finally:
        connection.close()

Uploads QC results for a given instrument run to Google Drive as CSV files.

Arguments:
  • instrument_id (str): Instrument ID
  • run_id (str): Instrument run ID (job ID)
Returns:

None

def download_qc_results(instrument_id, run_id):
def download_qc_results(instrument_id, run_id):

    """
    Downloads CSV files of QC results from Google Drive and stores in /data directory.

    The run's ZIP archive is looked up by title in the MS-AutoQC Google Drive folder, downloaded to
    /data/<instrument>_<run_id>/, and extracted into its "csv" subdirectory.

    Args:
        instrument_id (str):
            Instrument ID
        run_id (str):
            Instrument run ID (job ID)

    Returns:
        tuple: Paths of run.csv, samples.csv, and bio_standards.csv, respectively.
    """

    run_name = instrument_id.replace(" ", "_") + "_" + run_id

    # Get Google Drive instance
    drive = get_drive_instance()

    # Initialize directories (creates the run directory and its "csv" subdirectory)
    run_directory = os.path.join(data_directory, run_name)
    csv_directory = os.path.join(run_directory, "csv")
    os.makedirs(csv_directory, exist_ok=True)

    # Zip file
    zip_filename = run_name + ".zip"
    zip_file_path = os.path.join(run_directory, zip_filename)

    # Get Google Drive folder ID
    gdrive_folder_id = get_drive_folder_id()

    # Find and download ZIP archive of CSV files from Google Drive.
    # The archive is written straight to its destination path instead of changing the process
    # working directory, so a failed download cannot leave the working directory changed.
    for drive_file in drive.ListFile({"q": "'" + gdrive_folder_id + "' in parents and trashed=false"}).GetList():
        if drive_file["title"] == zip_filename:
            drive_file.GetContentFile(zip_file_path)
            break

    # Unzip archive
    unzip_csv_files(zip_file_path, csv_directory)

    # Define and return file paths
    run_csv = os.path.join(csv_directory, "run.csv")
    samples_csv = os.path.join(csv_directory, "samples.csv")
    bio_standards_csv_file = os.path.join(csv_directory, "bio_standards.csv")

    return (run_csv, samples_csv, bio_standards_csv_file)

Downloads CSV files of QC results from Google Drive and stores in /data directory.

Arguments:
  • instrument_id (str): Instrument ID
  • run_id (str): Instrument run ID (job ID)
Returns:

tuple: Paths of run.csv, samples.csv, and bio_standards.csv, respectively.

def get_drive_folder_id():
def get_drive_folder_id():

    """
    Looks up the Google Drive ID of the MS-AutoQC folder (found in user's root Drive directory).

    The value is read from the first row of the "gdrive_folder_id" column
    in the "workspace" table of the Settings database.
    """

    workspace = get_table("Settings", "workspace")
    folder_ids = workspace["gdrive_folder_id"].values
    return folder_ids[0]

Returns Google Drive ID for the MS-AutoQC folder (found in user's root Drive directory).

def get_database_drive_id(instrument_id):
def get_database_drive_id(instrument_id):

    """
    Looks up the Google Drive ID for a given instrument's database.

    The ID is read from the "drive_id" column of the matching row
    in the "instruments" table of the Settings database.

    Args:
        instrument_id (str): Instrument ID

    Returns:
        str: Google Drive ID for the instrument database ZIP archive.
    """

    instruments = get_table("Settings", "instruments")

    # Select the row(s) whose name matches the instrument, then take the first Drive ID
    is_instrument = instruments["name"] == instrument_id
    drive_ids = instruments.loc[is_instrument, "drive_id"].values
    return drive_ids[0]

Returns Google Drive ID for a given instrument's database.

Arguments:
  • instrument_id (str): Instrument ID
Returns:

str: Google Drive ID for the instrument database ZIP archive.

def upload_database(instrument_id, sync_settings=False):
def upload_database(instrument_id, sync_settings=False):

    """
    Pushes the instrument database (and optionally the methods directory) to Google Drive as ZIP archives.

    The database is vacuumed before upload. After a successful upload, the archive's
    "modifiedDate" reported by Google Drive is stored locally.

    Args:
        instrument_id (str):
            Instrument ID for the instrument database to upload
        sync_settings (bool, default False):
            Whether to upload methods directory as well

    Returns:
        str: Timestamp upon upload completion, or None if the Google Drive folder ID
        or the database file ID is not set.
    """

    # Drive ID's of the MS-AutoQC folder and of this instrument's database archive
    folder_id = get_drive_folder_id()
    database_file_id = get_database_drive_id(instrument_id)

    drive = get_drive_instance()

    # Shrink the database before zipping it
    execute_vacuum(instrument_id)

    # Methods directory goes first when requested
    if sync_settings == True:
        upload_methods()

    # Without both Drive ID's there is nowhere to upload to
    if folder_id is None or database_file_id is None:
        return None

    # Zip the database and overwrite the existing archive in Google Drive
    zip_database(instrument_id=instrument_id)
    archive_title = instrument_id.replace(" ", "_") + ".zip"
    drive_file = drive.CreateFile({"id": database_file_id, "title": archive_title})
    drive_file.SetContentFile(get_database_file(instrument_id, zip=True))
    drive_file.Upload()

    # Remember modifiedDate of the uploaded archive
    remember_last_modified(database=instrument_id, modified_date=drive_file["modifiedDate"])

    return time.strftime("%H:%M:%S")

Uploads database file and methods directory to Google Drive as ZIP archives.

Arguments:
  • instrument_id (str): Instrument ID for the instrument database to upload
  • sync_settings (bool, default False): Whether to upload methods directory as well
Returns:

str: Timestamp upon upload completion, or None if the Google Drive folder ID or database file ID is not set.

def download_database(instrument_id, sync_settings=False):
def download_database(instrument_id, sync_settings=False):

    """
    Downloads instrument database ZIP file from Google Drive.

    This function is called when accessing an instrument database from a device other than the given instrument.

    Args:
        instrument_id (str):
            Instrument ID for the instrument database to download
        sync_settings (bool, default False):
            Whether to download methods directory as well

    Returns:
        str: Timestamp upon download completion, or None if the database was not modified,
        the Google Drive folder ID or database file ID is not set, or the download failed.
    """

    db_zip_file = instrument_id.replace(" ", "_") + ".zip"

    # If the database was not modified by another instrument, skip download (for instruments only)
    if not database_was_modified(instrument_id):
        return None

    # Get Google Drive instance
    drive = get_drive_instance()

    # Get Google Drive ID's for the MS-AutoQC folder and database file
    gdrive_folder_id = get_drive_folder_id()
    instrument_db_file_id = get_instrument(instrument_id)["drive_id"].values[0]

    # Without both Drive ID's there is nothing to download
    if gdrive_folder_id is None or instrument_db_file_id is None:
        return None

    # Download newly added / modified MSP files in MS-AutoQC > methods
    if sync_settings == True:
        download_methods(skip_check=True)

    try:
        for drive_file in drive.ListFile({"q": "'" + gdrive_folder_id + "' in parents and trashed=false"}).GetList():
            if drive_file["title"] == db_zip_file:

                # Download the archive straight into the data directory. The process working directory
                # is left untouched, so a failed download cannot leave it pointing at the data directory.
                drive_file.GetContentFile(os.path.join(data_directory, drive_file["title"]))

                # Unzip database
                unzip_database(instrument_id=instrument_id)

                # Save modifiedDate of database file
                remember_last_modified(database=instrument_id, modified_date=drive_file["modifiedDate"])

    except Exception as error:
        print("Error downloading database from Google Drive:", error)
        return None

    return time.strftime("%H:%M:%S")

Downloads instrument database ZIP file from Google Drive.

This function is called when accessing an instrument database from a device other than the given instrument.

Arguments:
  • instrument_id (str): Instrument ID for the instrument database to download
  • sync_settings (bool, default False): Whether to download methods directory as well
Returns:

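str: Timestamp upon download completion, or None if the database was not modified, the Google Drive folder ID or database file ID is not set, or the download failed.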

def upload_methods():
def upload_methods():

    """
    Pushes the methods directory to Google Drive as a ZIP archive.

    The Settings database is vacuumed first. If no Drive ID is stored for the methods archive,
    nothing is uploaded. After a successful upload, the archive's "modifiedDate" is stored locally.

    Returns:
        None
    """

    # Drive ID of the existing methods archive
    workspace = get_table("Settings", "workspace")
    archive_drive_id = workspace["methods_zip_file_id"].values[0]

    # Shrink the Settings database before zipping
    execute_vacuum("Settings")

    drive = get_drive_instance()

    # Nothing to overwrite if the archive has no Drive ID
    if archive_drive_id is None:
        return None

    # Zip the methods directory and overwrite the archive in Google Drive
    archive_path = zip_methods()
    drive_file = drive.CreateFile({"id": archive_drive_id, "title": "methods.zip"})
    drive_file.SetContentFile(archive_path)
    drive_file.Upload()

    # Remember modifiedDate of the uploaded archive
    remember_last_modified(database="Settings", modified_date=drive_file["modifiedDate"])

Uploads methods directory ZIP archive to Google Drive.

def download_methods(skip_check=False):
def download_methods(skip_check=False):

    """
    Downloads methods directory ZIP archive from Google Drive.

    The device identity and MS-DIAL directory are read before the download and written back afterwards
    (presumably because unzipping the methods directory replaces the local Settings database; TODO confirm).

    Args:
        skip_check (bool, default False): If True, skips checking whether database was modified

    Returns:
        str: Timestamp upon download completion, or None if the methods directory was not modified
        or the download failed.
    """

    # If the database was not modified by another instrument, skip download (for instruments only)
    if not skip_check:
        if not database_was_modified("Settings"):
            return None

    # Get device identity
    instrument_bool = is_instrument_computer()
    device_identity = get_device_identity()

    # Get MS-DIAL directory (best effort; None if it cannot be read)
    try:
        msdial_directory = get_msdial_directory()
    except Exception:
        msdial_directory = None

    # Get Google Drive instance
    drive = get_drive_instance()

    # Get Google Drive folder ID
    gdrive_folder_id = get_drive_folder_id()

    try:
        # Download and unzip methods directory
        for drive_file in drive.ListFile({"q": "'" + gdrive_folder_id + "' in parents and trashed=false"}).GetList():
            if drive_file["title"] == "methods.zip":

                # Download the archive straight into the data directory. The process working directory
                # is left untouched, so a failed download cannot leave it pointing at the data directory.
                drive_file.GetContentFile(os.path.join(data_directory, drive_file["title"]))

                # Unzip methods directory
                unzip_methods()

                # Save modifiedDate of methods directory
                remember_last_modified(database="Settings", modified_date=drive_file["modifiedDate"])

    except Exception as error:
        print("Error downloading methods from Google Drive:", error)
        return None

    # Update MS-DIAL directory
    update_msdial_directory(msdial_directory)

    # Update user device identity
    set_device_identity(is_instrument_computer=instrument_bool, instrument_id=device_identity)
    return time.strftime("%H:%M:%S")

Downloads methods directory ZIP archive from Google Drive.

Arguments:
  • skip_check (bool, default False): If True, skips checking whether database was modified
Returns:

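str: Timestamp upon download completion, or None if the methods directory was not modified or the download failed.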

def remember_last_modified(database, modified_date):
def remember_last_modified(database, modified_date):

    """
    Stores last modified time of database file in Google Drive.

    This function is called after file upload, and used for comparison before download.
    For "Settings", the value is written to "methods_last_modified" in the "workspace" table (row with id 1);
    otherwise it is written to "last_modified" for the matching instrument in the "instruments" table.

    Args:
        database (str):
            Name of database (either Instrument ID or "Settings")
        modified_date (str):
            Modified date of file uploaded to Google Drive

    Returns:
        None
    """

    db_metadata, connection = connect_to_database("Settings")

    # Close the connection even if the update fails
    try:
        if database == "Settings":
            workspace_table = sa.Table("workspace", db_metadata, autoload=True)
            update_last_modified = (
                sa.update(workspace_table)
                    .where((workspace_table.c.id == 1))
                    .values(methods_last_modified=modified_date)
            )
        else:
            instruments_table = sa.Table("instruments", db_metadata, autoload=True)
            update_last_modified = (
                sa.update(instruments_table)
                    .where((instruments_table.c.name == database))
                    .values(last_modified=modified_date)
            )

        connection.execute(update_last_modified)
    finally:
        connection.close()

Stores last modified time of database file in Google Drive.

This function is called after file upload, and used for comparison before download.

Arguments:
  • database (str): Name of database (either Instrument ID or "Settings")
  • modified_date (str): Modified date of file uploaded to Google Drive
Returns:

None

def database_was_modified(database_name):
def database_was_modified(database_name):

    """
    Checks whether a workspace file in Google Drive differs from the locally remembered version.

    The locally stored "last modified" value is compared against the "modifiedDate" of the matching
    ZIP archive in the MS-AutoQC Google Drive folder ("methods.zip" for "Settings", otherwise
    "<database name with spaces replaced by underscores>.zip").

    Args:
        database_name (str): Name of database

    Returns:
        bool: False if the local and Google Drive values are equal, True otherwise
        (including when the archive is not found in Google Drive and a local value is stored).
    """

    # Folder to search in Google Drive
    folder_id = get_drive_folder_id()

    # Locally remembered value and the archive title to look for
    if database_name != "Settings":
        local_value = get_instrument(database_name)["last_modified"].values[0]
        archive_title = database_name.replace(" ", "_") + ".zip"
    else:
        local_value = get_table("Settings", "workspace")["methods_last_modified"].values[0]
        archive_title = "methods.zip"

    drive = get_drive_instance()
    folder_listing = drive.ListFile({"q": "'" + folder_id + "' in parents and trashed=false"}).GetList()

    # modifiedDate of the first file with a matching title, or None if there is no match
    drive_value = next(
        (entry["modifiedDate"] for entry in folder_listing if entry["title"] == archive_title),
        None,
    )

    return not (local_value == drive_value)

Returns True if workspace file was modified by another instrument PC in Google Drive, and False if not.

Arguments:
  • database_name (str): Name of database
Returns:

Returns True if workspace file was modified by another instrument PC in Google Drive, and False if not.

def send_sync_signal(folder_id):
def send_sync_signal(folder_id):

    """
    Uploads empty file to signal that an instrument PC is syncing to Google Drive.

    The signal is a file titled "Syncing" created inside the given folder.

    TODO: This method is deprecated. Please remove if no plans for usage.

    Args:
        folder_id (str): Google Drive folder ID

    Returns:
        bool: True if sync signal was sent, False if not.
    """

    # Get Google Drive instance
    drive = get_drive_instance()

    # Exception (rather than a bare except) lets KeyboardInterrupt / SystemExit propagate
    try:
        drive.CreateFile(metadata={"title": "Syncing", "parents": [{"id": folder_id}]}).Upload()
        return True
    except Exception:
        return False

Uploads empty file to signal that an instrument PC is syncing to Google Drive.

TODO: This method is deprecated. Please remove if no plans for usage.

Arguments:
  • folder_id (str): Google Drive folder ID
Returns:

bool: True if sync signal was sent, False if not.

def safe_to_upload(folder_id):
def safe_to_upload(folder_id):

    """
    Checks whether the given Google Drive folder is free of a "Syncing" signal file.

    TODO: This method is deprecated. Please remove if no plans for usage.

    Args:
        folder_id (str): Google Drive folder ID

    Returns:
        bool: False if a file titled "Syncing" exists in the folder (another device is uploading), True if not.
    """

    drive = get_drive_instance()
    folder_listing = drive.ListFile({"q": "'" + folder_id + "' in parents and trashed=false"}).GetList()

    # Safe only when no entry carries the signal title
    return not any(entry["title"] == "Syncing" for entry in folder_listing)

Returns False if another device is currently uploading to Google Drive, else True.

TODO: This method is deprecated. Please remove if no plans for usage.

Arguments:
  • folder_id (str): Google Drive folder ID
Returns:

bool: False if another device is currently uploading to Google Drive, True if not.

def remove_sync_signal(folder_id):
def remove_sync_signal(folder_id):

    """
    Removes empty signal file to signal that an instrument PC has completed syncing to Google Drive.

    Every file titled "Syncing" in the given folder is deleted.

    TODO: This method is deprecated. Please remove if no plans for usage.

    Args:
        folder_id (str): Google Drive folder ID

    Returns:
        bool: True if the folder was scanned without error (even if no signal file was found), False if not.
    """

    # Get Google Drive instance
    drive = get_drive_instance()

    # Exception (rather than a bare except) lets KeyboardInterrupt / SystemExit propagate
    try:
        for drive_file in drive.ListFile({"q": "'" + folder_id + "' in parents and trashed=false"}).GetList():
            if drive_file["title"] == "Syncing":
                drive_file.Delete()
        return True
    except Exception:
        return False

Removes empty signal file to signal that an instrument PC has completed syncing to Google Drive.

TODO: This method is deprecated. Please remove if no plans for usage.

Arguments:
  • folder_id (str): Google Drive folder ID
Returns:

bool: True if sync signal was removed, False if not.

def delete_active_run_csv_files(instrument_id, run_id):
def delete_active_run_csv_files(instrument_id, run_id):

    """
    Checks for and deletes CSV files from Google Drive at the end of an active instrument run.

    The run's ZIP archive is looked up by title in the MS-AutoQC Google Drive folder and deleted if found.
    The run's "drive_id" in the "runs" table is then cleared, whether or not an archive was found.

    Args:
        instrument_id (str):
            Instrument ID
        run_id (str):
            Instrument run ID (job ID)

    Returns:
        None
    """

    zip_filename = instrument_id.replace(" ", "_") + "_" + run_id + ".zip"

    # Find zip archive of CSV files in Google Drive and delete it
    drive = get_drive_instance()
    gdrive_folder_id = get_drive_folder_id()

    if gdrive_folder_id is not None:
        drive_file_list = drive.ListFile({"q": "'" + gdrive_folder_id + "' in parents and trashed=false"}).GetList()
        for drive_file in drive_file_list:
            if drive_file["title"] == zip_filename:
                drive_file.Delete()
                break

    # Delete Drive ID from database
    db_metadata, connection = connect_to_database(instrument_id)

    # Close the connection even if the update fails
    try:
        runs_table = sa.Table("runs", db_metadata, autoload=True)

        connection.execute((
            sa.update(runs_table)
                .where(runs_table.c.run_id == run_id)
                .values(drive_id=None)
        ))
    finally:
        connection.close()

Checks for and deletes CSV files from Google Drive at the end of an active instrument run.

Arguments:
  • instrument_id (str): Instrument ID
  • run_id (str): Instrument run ID (job ID)
Returns:

None

def sync_on_run_completion(instrument_id, run_id):
def sync_on_run_completion(instrument_id, run_id):

    """
    Syncs database with Google Drive once an active instrument run has finished.

    Steps, in order (a failure in either step is printed and stops the sync):
        1. Upload database to Google Drive
        2. Delete active run CSV files

    Args:
        instrument_id (str):
            Instrument ID
        run_id (str):
            Instrument run ID (job ID)

    Returns:
        None
    """

    # The Drive instance and folder ID are requested up front, outside the error handling below;
    # their results are not used by this function.
    get_drive_instance()
    get_drive_folder_id()

    # Each step is paired with the message printed if it fails
    sync_steps = (
        (lambda: upload_database(instrument_id),
            "sync_on_run_completion() – Error uploading database during sync"),
        (lambda: delete_active_run_csv_files(instrument_id, run_id),
            "sync_on_run_completion() – Error deleting CSV files after sync"),
    )

    for run_step, failure_message in sync_steps:
        try:
            run_step()
        except Exception as error:
            print(failure_message, error)
            return None

Syncs database with Google Drive at the end of an active instrument run.

Performs the following actions:
  1. Upload database to Google Drive
  2. Delete active run CSV files
Arguments:
  • instrument_id (str): Instrument ID
  • run_id (str): Instrument run ID (job ID)
Returns:

None

def get_data_file_type(instrument_id):
def get_data_file_type(instrument_id):

    """
    Returns expected data file extension based on instrument vendor type.

    The vendor is read from the "instruments" table of the settings database,
    matching on the instrument name.

    TODO: Modify this function as needed when adding support for other instrument vendors.

    Args:
        instrument_id (str): Instrument ID

    Returns:
        Data file extension for instrument vendor (no leading dot),
        or None if the vendor is not one of the supported vendors.

    Raises:
        IndexError: If no instrument with the given name is found.
    """

    # Supported vendors and the extension of the data files they produce
    vendor_extensions = {
        "Thermo Fisher": "raw",
        "Agilent": "d",
        "Bruker": "baf",
        "Waters": "raw",
        "Sciex": "wiff2",
    }

    engine = sa.create_engine(settings_database)

    # Bind the instrument name as a parameter rather than concatenating it into
    # the SQL string, so names containing quotes cannot break or alter the query
    query = sa.text("SELECT * FROM instruments WHERE name = :name")
    df_instruments = pd.read_sql(query, engine, params={"name": instrument_id})
    vendor = df_instruments["vendor"].astype(str).tolist()[0]

    return vendor_extensions.get(vendor)

Returns expected data file extension based on instrument vendor type.

TODO: Modify this function as needed when adding support for other instrument vendors.

Arguments:
  • instrument_id (str): Instrument ID
Returns:

Data file extension for instrument vendor.

def is_completed_run(instrument_id, run_id):
def is_completed_run(instrument_id, run_id):

    """
    Returns True if the given QC job is for a completed run, and False if for an active run.

    If the job type cannot be retrieved, the error is printed and False is returned.

    Args:
        instrument_id (str):
            Instrument ID
        run_id (str):
            Instrument run ID (job ID)

    Returns:
        bool: True if the job is for a completed run, and False if job is for an active run
        (or if the job type could not be determined).
    """

    try:
        job_type = get_instrument_run(instrument_id, run_id)["job_type"].astype(str).values[0]
    except Exception:
        # Catch Exception (not a bare except) so Ctrl+C / interpreter exit still propagate
        print("Could not get MS-AutoQC job type.")
        traceback.print_exc()
        return False

    return bool(job_type == "completed")

Returns True if the given QC job is for a completed run, and False if for an active run.

Arguments:
  • instrument_id (str): Instrument ID
  • run_id (str): Instrument run ID (job ID)
Returns:

bool: True if the job is for a completed run, and False if job is for an active run.

def delete_temp_directory(instrument_id, run_id):
def delete_temp_directory(instrument_id, run_id):

    """
    Deletes temporary data file directory in local app directory.

    The directory is located inside the data directory and named after the
    instrument ID (spaces replaced by underscores) and the run ID.
    Deletion is best-effort: any failure is printed, never raised.

    This function is called at the end of an instrument run (QC job).

    Args:
        instrument_id (str):
            Instrument ID
        run_id (str):
            Instrument run ID (job ID)

    Returns:
        None
    """

    # Delete temporary data file directory
    try:
        folder_name = instrument_id.replace(" ", "_") + "_" + run_id
        temp_directory = os.path.join(data_directory, folder_name)
        if os.path.exists(temp_directory):
            shutil.rmtree(temp_directory)
    except Exception:
        # Catch Exception (not a bare except) so Ctrl+C / interpreter exit still propagate;
        # print the traceback so the reason for the failure is not lost
        print("Could not delete temporary data directory.")
        traceback.print_exc()

Deletes temporary data file directory in local app directory.

This function is called at the end of an instrument run (QC job).

Arguments:
  • instrument_id (str): Instrument ID
  • run_id (str): Instrument run ID (job ID)
Returns:

None

def pipeline_valid(module=None):
def pipeline_valid(module=None):

    """
    Validates that MSConvert and MS-DIAL dependencies are installed.

    A dependency counts as installed when its executable exists in the configured
    directory; if the directory cannot be retrieved, it counts as not installed.

    This function is called during job setup validation.

    Args:
        module (str, default None): If specified, only validates given module.
            "msdial" checks MsdialConsoleApp.exe only, "msconvert" checks msconvert.exe only;
            any other value checks both.

    Returns:
        bool: Whether MSConvert.exe and MsdialConsoleApp.exe exist.
    """

    # Catch Exception (not a bare except) so Ctrl+C / interpreter exit still propagate
    try:
        msconvert_installed = os.path.exists(os.path.join(get_msconvert_directory(), "msconvert.exe"))
    except Exception:
        msconvert_installed = False

    try:
        msdial_installed = os.path.exists(os.path.join(get_msdial_directory(), "MsdialConsoleApp.exe"))
    except Exception:
        msdial_installed = False

    if module == "msdial":
        return msdial_installed
    if module == "msconvert":
        return msconvert_installed
    return msconvert_installed and msdial_installed

Validates that MSConvert and MS-DIAL dependencies are installed.

This function is called during job setup validation.

Arguments:
  • module (str, default None): If specified, only validates given module.
Returns:

bool: Whether MSConvert.exe and MsdialConsoleApp.exe exist.

def send_email(subject, message_body):
def send_email(subject, message_body):

    """
    Sends email using Google authenticated credentials.

    Credentials are loaded from the email credentials file, and the message is
    addressed to the registered notification email list.

    This function is called for QC warnings and QC fails if:
        1. Google Drive sync is enabled
        2. Email addresses are registered for notifications

    Args:
        subject (str):
            Subject of email
        message_body (str):
            Body of email

    Returns:
        On success, the response returned by executing the Gmail API send request
        (per the Gmail API documentation, the sent message resource).
        On any failure, the traceback is printed and None is returned.
    """

    try:
        credentials = google_auth.load_credentials_from_file(alt_credentials)[0]
        service = build("gmail", "v1", credentials=credentials)

        message = EmailMessage()
        message.set_content(message_body)
        message["Subject"] = subject
        message["To"] = get_email_notifications_list(as_string=True)

        # The request body carries the whole message as URL-safe base64 under "raw"
        encoded_message = base64.urlsafe_b64encode(message.as_bytes()).decode()
        request_body = {"raw": encoded_message}

        return service.users().messages().send(userId="me", body=request_body).execute()

    except Exception:
        # Notification is best-effort: report the problem and signal failure with None
        traceback.print_exc()
        return None

Sends email using Google authenticated credentials.

This function is called for QC warnings and QC fails if:
  1. Google Drive sync is enabled
  2. Email addresses are registered for notifications
Arguments:
  • subject (str): Subject of email
  • message_body (str): Body of email
Returns:

On success, the response returned by the Gmail API send request; None if sending fails.