Skip to Content
User Guide › Python Library › Files API (Sync)

Files API

Complete guide to working with files using the sync client library.

Overview

The Files API provides methods to upload, download, list, update, and delete files attached to records. Access it via client.files.

# Obtain a synchronous client for the repository; the Files API is
# exposed on its `.files` attribute.
from nrp_cmd.sync_client import get_sync_client

client = get_sync_client("https://your-repository.org")
files_client = client.files

Listing Files

List Files on a Record

# List all files on a draft record record = client.records.draft_records.read("abc-123") files = client.files.list(record) for file in files: print(f"File: {file.key}") print(f" Size: {file.size} bytes") print(f" Checksum: {file.checksum}") print(f" Metadata: {file.metadata}")

List by Record URL

from yarl import URL # List files using record URL record_url = URL("https://repository.org/api/records/abc-123") files = client.files.list(record_url)

Access File Properties

files = client.files.list(record)
for file in files:
    # Basic properties
    print(f"Key: {file.key}")
    print(f"Size: {file.size}")
    print(f"MIME type: {file.mimetype}")
    print(f"Checksum: {file.checksum}")
    # Metadata
    print(f"Description: {file.metadata.get('description', 'N/A')}")
    # Status
    print(f"Status: {file.status}")  # e.g., 'completed'
    # Links
    print(f"Download URL: {file.links.content}")
    print(f"Self URL: {file.links.self_}")

Uploading Files

Basic Upload from File Path

# Upload a file from filesystem file = client.files.upload( record, key="data.csv", metadata={"description": "Dataset file"}, source="/path/to/data.csv" ) print(f"Uploaded: {file.key} ({file.size} bytes)")

Upload with Progress Bar

# Upload with progress tracking file = client.files.upload( record, key="large-file.zip", metadata={"description": "Large dataset archive"}, source="/path/to/large-file.zip", progress="Uploading large file" # Shows progress bar )

Upload from Path Object

from pathlib import Path # Upload using Path data_file = Path("data/experiment_results.csv") file = client.files.upload( record, key=data_file.name, metadata={"description": "Experiment results"}, source=data_file )

Upload with Data Stream

from nrp_cmd.sync_client.streams import FileSource # Upload using FileSource for more control source = FileSource("/path/to/file.pdf") file = client.files.upload( record, key="document.pdf", metadata={"description": "Research paper"}, source=source )

Upload from Memory

from nrp_cmd.sync_client.streams import MemorySource # Upload from bytes in memory data = b"This is my file content" source = MemorySource(data) file = client.files.upload( record, key="textfile.txt", metadata={"description": "Text file"}, source=source )

Upload Multiple Files

# Upload multiple files to a record files_to_upload = [ ("data.csv", "/path/to/data.csv", "Raw data"), ("analysis.py", "/path/to/analysis.py", "Analysis script"), ("results.pdf", "/path/to/results.pdf", "Results report"), ] uploaded_files = [] for key, path, description in files_to_upload: file = client.files.upload( record, key=key, metadata={"description": description}, source=path, progress=f"Uploading {key}" ) uploaded_files.append(file) print(f"✓ Uploaded {key}")

Upload with Transfer Type

ℹ️

Some repositories support different transfer types like URL fetch or multipart upload. Check your repository’s documentation.

# Upload via URL fetch (if supported by repository) file = client.files.upload( record, key="remote-data.csv", metadata={"description": "Data from URL"}, source="https://example.com/data.csv", transfer_type="url-fetch" )

Downloading Files

Download to File Path

# Download file to local filesystem file = client.files.read(record, "data.csv") client.files.download( file, "/path/to/save/data.csv" )

Download with Progress Bar

# Download with progress tracking client.files.download( file, "/path/to/save/large-file.zip", progress="Downloading large file" )

Download Using File Object

# Get file info first, then download files = client.files.list(record) file = files[0] # Get first file client.files.download( file, f"/downloads/{file.key}", progress=f"Downloading {file.key}" )

Download Using File URL

from yarl import URL # Download directly from file URL file_url = URL("https://repository.org/api/records/abc-123/files/data.csv") client.files.download( file_url, "/path/to/save/data.csv" )

Download with DataSink

from nrp_cmd.sync_client.streams import FileSink # Download using FileSink for more control sink = FileSink("/path/to/output.pdf") client.files.download(file, sink)

Download to Memory

from nrp_cmd.sync_client.streams import MemorySink # Download to memory sink = MemorySink() client.files.download(file, sink) # Access the data data = sink.data print(f"Downloaded {len(data)} bytes")

Download Multiple Files

# Download all files from a record (can be draft or published) record = client.records.draft_records.read("abc-123") files = client.files.list(record) for file in files: output_path = f"/downloads/{record.id}/{file.key}" client.files.download( file, output_path, progress=f"Downloading {file.key}" ) print(f"✓ Downloaded {file.key}")

Download Multiple Files Sequentially

# Download multiple files from a record def download_file(file, output_dir): output_path = f"{output_dir}/{file.key}" client.files.download(file, output_path) return file.key def download_all_files(record, output_dir): files = client.files.list(record) # Download each file results = [] for file in files: result = download_file(file, output_dir) results.append(result) return results # Use it downloaded = download_all_files(record, "/downloads") print(f"Downloaded {len(downloaded)} files")

Reading File Metadata

Read File Info

# Read file metadata without downloading content file = client.files.read(record, "data.csv") print(f"File key: {file.key}") print(f"Size: {file.size}") print(f"Checksum: {file.checksum}") print(f"MIME type: {file.mimetype}") print(f"Metadata: {file.metadata}")

Read by File URL

from yarl import URL # Read file info from URL file_url = URL("https://repository.org/api/records/abc-123/files/data.csv") file = client.files.read(file_url)

Check File Existence

# Check if file exists on record.
# Catch the library's client error (e.g. a 404 response) rather than a
# bare Exception, so that network or authentication failures are not
# silently reported as a missing file.
from nrp_cmd.errors import RepositoryClientError

try:
    file = client.files.read(record, "myfile.pdf")
    print(f"File exists: {file.key}")
except RepositoryClientError:
    print("File does not exist")

Updating File Metadata

Update File Metadata

# Read file, update metadata, save file = client.files.read(record, "data.csv") # Update metadata file.metadata["description"] = "Updated description" file.metadata["type"] = "dataset" file.metadata["processing_date"] = "2024-01-15" # Save changes updated_file = client.files.update(file) print(f"Updated metadata for: {updated_file.key}")

Add Metadata Fields

# Add new metadata fields (must be supported by schema on server) file = client.files.read(record, "image.jpg") file.metadata["photographer"] = "John Doe" file.metadata["location"] = "Research Site A" file.metadata["date_taken"] = "2024-01-15" file.metadata["camera"] = "Canon EOS R5" updated = client.files.update(file)

Bulk Metadata Update

# Update metadata for multiple files def update_all_file_metadata(record, new_metadata): files = client.files.list(record) for file in files: # Merge new metadata file.metadata.update(new_metadata) client.files.update(file) print(f"✓ Updated {file.key}") # Use it update_all_file_metadata( record, {"license": "CC-BY-4.0", "processed": True} )

Deleting Files

Delete by Record and Key

# Delete a file by key client.files.delete(record, key="unwanted-file.txt") print("File deleted")

Delete Using File Object

# Delete using file object file = client.files.read(record, "old-data.csv") client.files.delete(file)

Delete by File URL

from yarl import URL # Delete using file URL file_url = URL("https://repository.org/api/records/abc-123/files/temp.txt") client.files.delete(file_url)

Delete All Files

# Delete all files from a record files = client.files.list(record) for file in files: client.files.delete(file) print(f"✓ Deleted {file.key}")

Conditional Delete

# Delete files matching condition files = client.files.list(record) for file in files: # Delete only temporary files if file.key.startswith("temp_"): client.files.delete(file) print(f"✓ Deleted temporary file: {file.key}")

Complete Workflow Examples

Upload Multiple Files and Publish

def create_record_with_files(title, file_paths):
    """Create a draft record, upload the given files, and publish it.

    Args:
        title: Title for the new record's metadata.
        file_paths: Iterable of filesystem paths to upload; each file is
            keyed by its basename.

    Returns:
        The published record.
    """
    # Create draft record
    draft = client.records.create({
        "metadata": {
            "title": title,
            "creators": [{"person_or_org": {"name": "Researcher"}}],
            "resource_type": {"id": "dataset"}
        }
    })
    # Upload each file; the filename placeholder in the description and
    # progress label was lost in the page scrape — restored here to match
    # the pattern used by the other examples on this page.
    for path in file_paths:
        filename = Path(path).name
        client.files.upload(
            draft,
            key=filename,
            metadata={"description": f"Data file: {filename}"},
            source=path,
            progress=f"Uploading {filename}"
        )
    # Publish the record
    published = client.records.publish(draft)
    return published

# Use it
published = create_record_with_files(
    "Research Dataset 2024",
    [
        "/data/experiment1.csv",
        "/data/experiment2.csv",
        "/data/README.txt"
    ]
)

Download Complete Record with Files

def download_record_with_files(record_id, output_dir):
    """Download record metadata and all files.

    Writes <output_dir>/<record_id>/metadata.json plus a files/ directory
    containing each file and a sidecar <key>.metadata.json for it.
    Returns the created output path.
    """
    from pathlib import Path
    import json
    # Create output directory
    output_path = Path(output_dir) / record_id
    output_path.mkdir(parents=True, exist_ok=True)
    # Get record (can be draft or published)
    record = client.records.draft_records.read(record_id)
    # Save metadata
    metadata_file = output_path / "metadata.json"
    with open(metadata_file, 'w') as f:
        json.dump(record.metadata, f, indent=2)
    # Download files
    files = client.files.list(record)
    files_dir = output_path / "files"
    files_dir.mkdir(exist_ok=True)
    for file in files:
        file_path = files_dir / file.key
        client.files.download(
            file,
            str(file_path),
            progress=f"Downloading {file.key}"
        )
        # Also save file metadata
        file_meta_path = files_dir / f"{file.key}.metadata.json"
        with open(file_meta_path, 'w') as f:
            json.dump({
                "key": file.key,
                "size": file.size,
                "checksum": file.checksum,
                "mimetype": file.mimetype,
                "metadata": file.metadata
            }, f, indent=2)
    return output_path

# Use it
saved_to = download_record_with_files(
    "abc-123",
    "/backups"
)
print(f"Downloaded to: {saved_to}")

Replace File with New Version

def replace_file(record, key, new_file_path): """Replace an existing file with a new version.""" # Delete old file try: client.files.delete(record, key=key) except Exception: pass # File might not exist # Upload new version file = client.files.upload( record, key=key, metadata={"description": "Updated version"}, source=new_file_path, progress=f"Uploading {key}" ) return file # Use it updated = replace_file( record, "data.csv", "/path/to/new_data.csv" )

Sync Files from Directory

def sync_files_to_record(record, directory_path):
    """Upload all files from directory to record.

    Only regular files in the top level of the directory are uploaded
    (iterdir does not recurse). Returns the list of uploaded file objects.
    """
    from pathlib import Path
    directory = Path(directory_path)
    uploaded = []
    for file_path in directory.iterdir():
        if file_path.is_file():
            file = client.files.upload(
                record,
                key=file_path.name,
                metadata={
                    "description": f"Synced from {directory.name}",
                    "original_path": str(file_path)
                },
                source=str(file_path),
                progress=f"Uploading {file_path.name}"
            )
            uploaded.append(file)
    return uploaded

# Use it
files = sync_files_to_record(
    draft_record,
    "/data/experiment_2024"
)
print(f"Uploaded {len(files)} files")

Copy Files Between Records

def copy_files_between_records(source_record_id, dest_record_id):
    """Copy all files from one record to another.

    Files are staged on local disk in a temporary directory, so the
    process needs enough free space for the largest file.
    Returns the number of files copied.
    """
    import tempfile
    from pathlib import Path
    # Can copy from/to draft or published records
    source_record = client.records.draft_records.read(source_record_id)
    dest_record = client.records.draft_records.read(dest_record_id)
    source_files = client.files.list(source_record)
    # Use temporary directory for intermediate storage
    with tempfile.TemporaryDirectory() as tmpdir:
        for file in source_files:
            # Download from source
            tmp_path = Path(tmpdir) / file.key
            client.files.download(file, str(tmp_path))
            # Upload to destination
            client.files.upload(
                dest_record,
                key=file.key,
                metadata=file.metadata.copy(),
                source=str(tmp_path),
                progress=f"Copying {file.key}"
            )
    return len(source_files)

# Use it
copied = copy_files_between_records("source-123", "dest-456")
print(f"Copied {copied} files")

Validate File Checksums

def validate_file_checksums(record):
    """Download each file on *record* and verify its stored checksum.

    The repository reports checksums as "<algorithm>:<hexdigest>"
    (e.g. "md5:abc..."). The algorithm is taken from that prefix via
    hashlib.new() instead of being hard-coded to MD5, so records whose
    repository uses sha256 etc. also validate correctly; checksums
    without a prefix fall back to md5 as before.

    Returns:
        A list of dicts with keys "file", "valid", "expected",
        "calculated".
    """
    import hashlib
    import tempfile
    from pathlib import Path
    files = client.files.list(record)
    results = []
    with tempfile.TemporaryDirectory() as tmpdir:
        for file in files:
            # Download file to a throwaway location
            tmp_path = Path(tmpdir) / file.key
            client.files.download(file, str(tmp_path))
            expected = file.checksum
            # Algorithm comes from the "<algo>:<hex>" prefix
            algo = expected.split(":", 1)[0] if ":" in expected else "md5"
            hasher = hashlib.new(algo)
            # Hash in 4 KiB chunks to keep memory bounded
            with open(tmp_path, 'rb') as f:
                for chunk in iter(lambda: f.read(4096), b''):
                    hasher.update(chunk)
            calculated = f"{algo}:{hasher.hexdigest()}"
            results.append({
                "file": file.key,
                "valid": calculated == expected,
                "expected": expected,
                "calculated": calculated
            })
    return results

# Use it
validation = validate_file_checksums(record)
for result in validation:
    status = "✓" if result["valid"] else "✗"
    print(f"{status} {result['file']}: {result['valid']}")

Error Handling

# Handle the library's error hierarchy explicitly: FileNotFoundError for
# a missing local source, RepositoryClientError for rejected requests,
# RepositoryCommunicationError for transport failures.
from nrp_cmd.errors import (
    RepositoryCommunicationError,
    RepositoryClientError
)

try:
    file = client.files.upload(
        record,
        key="data.csv",
        metadata={},
        source="/path/to/file.csv"
    )
except FileNotFoundError:
    print("Source file not found")
except RepositoryClientError as e:
    print(f"Upload failed: {e}")
except RepositoryCommunicationError as e:
    print(f"Network error: {e}")

API Reference

Methods

  • list(record_or_url) - List all files on a record
  • read(file_url) / read(record, key) - Get file metadata
  • upload(record_or_url, key, metadata, source, transfer_type='local-file', transfer_metadata=None, progress=None) - Upload a file
  • download(file_or_url, sink, *, parts=None, part_size=None, progress=None) - Download a file
  • download(record, key, sink, *, parts=None, part_size=None, progress=None) - Download file by key
  • update(file) - Update file metadata
  • delete(record, key=None) / delete(file) / delete(file_url) - Delete a file

Stream Classes

  • FileSource(path) - Read from filesystem
  • FileSink(path) - Write to filesystem
  • MemorySource(data) - Read from memory
  • MemorySink() - Write to memory

Transfer Types

  • 'local-file' - Standard file upload (default)
  • 'url-fetch' - Fetch from URL (if supported)
  • 'multipart' - Multipart upload (if supported)
Last updated on