# ori-data-pipeline/Sakefile.yaml
---
#using sake instead of just because need file outputs for dependencies ...
#! P=process
#! I=inputs
#! O=output
#! C=csvsql --no-inference --snifflimit 0 --query
#
#skip for demo: demo dataset already has normalized names.
# txns-names:
# help: normalize account names and fix missing names
# (ignore) doc: |-
# a case statement with a when for each variant present in the input data,
# because the payee field varies (e.g. 'Bob Fischer', 'B. Fischer').
# Then also add missing names (some bank transactions don't have a payee)
# (ignore) WARNING: |-
# might need to check the input data over time, as new variants occur... Is there some kind of warning I can build in to see if there are any UNKNOWNS left over?
# also, the second query, 'fix missing names', sets all remaining UNKNOWNs to Bank. Is it possible to do that so there's remaining UNKNOWNs to detect?
# dependencies:
# - "$I/bank-transactions.csv"
# - "fix_missing_names.sql"
# - "query_normalize_names.sql" #not present in this demo dataset
# formula: >-
# $C "normalize_names.sql"
# --query "fix_missing_names.sql"
# < "$I/bank-transactions.csv"
# > $P/txns-names.csv
# output:
# - $P/txns-names.csv
txns-fromto:
  help: convert transactions to format 'from — amount — to'
  dependencies:
  # - $P/txns-names.csv
    - $I/bank-transactions.csv
  # Positive amounts become name -> shared rows, negative amounts become
  # shared -> name rows (with the sign flipped), so every row reads
  # from / amount / to.
  # NOTE(review): double-quoted "shared"/"from"/"to" mix identifiers and
  # string literals -- this relies on SQLite's double-quoted string-literal
  # fallback, which works under csvsql's in-memory SQLite but would break on
  # a standards-strict SQL engine. Confirm before porting.
  formula: >-
    $C 'select id, date, name as "from",
    amount,
    "shared" as "to", description
    from stdin where cast(amount as real) > 0
    union select id, date, "shared" as "from",
    -amount,
    name as "to", description
    from stdin where cast(amount as real) < 0
    order by id, date;'
    < $I/bank-transactions.csv
    > $P/txns-fromto.csv
  output:
    - $P/txns-fromto.csv
#skip for demo. In the real dataset, kWh-meters:users is many:one so the meter usages have to be grouped and summed.
#months added to output file manually.
# kwh-usage:
# help: everyone's usage per period, including kWh used and months active in period.
# (ignore) doc: |
# in the real dataset, some households use multiple kWh meters. We need one kWh usage figure per account,
# so we have to sum the amount, grouped by account.
# dependencies:
# - $I/kwh-meters.csv
# - $I/months.csv
# formula: >-
# $C 'with vbwsum as
# (select "end" - "start" as usage, period, account from meters),
# vb as(select period, account, cast(sum(usage) as int) as usage
# from vbwsum
# group by period, account)
# select vb.*, months.months from vb
# inner join months on vb.period = months.period
# and vb.account = months.account;'
# --tables meters,months
# $I/kwh-meters.csv
# $I/months.csv
# > $P/kwh-usage.csv
# output:
# - $P/kwh-usage.csv
bank-costs:
  help: extract totals for each supplier from initial transaction table.
  (ignore) doc: |
    This gets calculated so that it gets updated when bank transactions input file changes. Only Bank is actually used for now...
  dependencies:
    - $P/txns-fromto.csv
  # Sums all amounts paid to "Bank" into a single account/amount row.
  # Fixed: cast to TEXT instead of STRING -- in SQLite a declared type of
  # STRING has NUMERIC affinity (not TEXT), so the original
  # `cast(... as string)` silently converted the rounded value back to a
  # number instead of producing a string as intended.
  formula: >-
    $C 'select "to" as account, cast(round(sum(amount), 2) as text) as amount from stdin
    where "to" = "Bank"
    group by "to";'
    < $P/txns-fromto.csv
    > $P/bank-costs.csv
  output:
    - $P/bank-costs.csv
total-val-txns:
  help: reverse value transactions for electricity usage variable and fixed, and banking costs
  (ignore) doc: |
    Doing this calculation is the main premise of this whole pipeline.
    To figure out how much all the users owe or are owed by the shared account,
    we allocate the final costs of energy delivered (from the invoices) to each account
    based on their proportion of kWh used (that gets done in the next step, indiv-val-txns).
    And then in the step txns-with-generated, we add these as 'virtual' transactions in reverse, from the Energy company to the shared account,
    and then from the shared account to the individual users.
    If everyone's account is settled correctly, then in the final balance every account should be at zero.
  dependencies:
    - $I/collective-usage-expenses.csv
    - $P/bank-costs.csv
  # Generates three kinds of rows: variable electricity value and fixed
  # electricity value (one each per period+account from `elec` =
  # collective-usage-expenses.csv), plus one banking-costs row from
  # `sup` = bank-costs.csv.
  # NOTE(review): the bank row's date "2025-09-01" is hard-coded -- it will
  # need updating (or deriving from the data) when the dataset changes.
  # NOTE(review): double-quoted values ("total", "2025-09-01", "shared",
  # "Bank", ...) rely on SQLite's double-quoted string-literal fallback;
  # confirm before porting to another SQL engine.
  formula: >-
    $C '
    with bank as (
    select account || "total" as id, "2025-09-01" as date, account, amount as amount, "value of banking costs" as description
    from sup where account = "Bank"
    )
    select period || account || "valvar" as id,
    enddate as date,
    account as "from",
    exp_var as amount,
    "shared" as "to",
    "value of variable electricity expenses" as description
    from elec
    union all select period || account || "valfixed" as id,
    enddate as date,
    account as "from",
    exp_fixed as amount,
    "shared" as "to",
    "value of fixed electricity expenses" as description
    from elec
    union all select id, date, account, amount, "shared", description from bank
    order by id, date, account;'
    --tables elec,sup
    $I/collective-usage-expenses.csv
    $P/bank-costs.csv
    > $P/total-val-txns.csv
  output:
    - $P/total-val-txns.csv
#Javascript version: uses percent-divide.js
indiv-val-txns:
  help: create txns for value consumed by each household
  (ignore) doc: |
    this step gets done in Javascript because sqlite can't do accurate floating point division.
    We need to allocate a proportion of the total expenses to each user:
    - collective energy usage * proportion of kWh per user
    - banking costs * months the user was participating (Sarah joined in the last month of period 2).
    The math is still not 100% accurate, but it's good enough for the current purposes.
    (the errors are to the order of 10e-10).
    Possible improvement: use Big.js to do the arithmetic, though there might still be a final rounding error at the end.
  dependencies:
    - $I/collective-usage-expenses.csv
    - $P/bank-costs.csv
    - $I/kwh-usage.csv
    - percent-divide.js
  # NOTE(review): percent-divide.js presumably takes the three csv paths as
  # positional arguments and writes the result to stdout -- verify in the
  # script itself.
  formula: >-
    deno run -A percent-divide.js $I/collective-usage-expenses.csv $P/bank-costs.csv $I/kwh-usage.csv
    > $P/indiv-val-txns.csv
  output:
    - $P/indiv-val-txns.csv
#Web Origami version: uses data.ori
#indiv-val-txns:
# help: create txns for value consumed by each household
# (ignore) doc: |
# this step gets done in Javascript because sqlite can't do accurate floating point division.
# We need to allocate a proportion of the total expenses to each user:
# - collective energy usage * proportion of kWh per user
# - banking costs * months the user was participating (Sarah joined in the last month of period 2).
# The math is still not 100% accurate, but it's good enough for the current purposes.
# (the errors are to the order of 10e-10).
# Possible improvement: use Big.js to do the arithmetic, though there might still be a final rounding error at the end.
# dependencies:
# - $I/collective-usage-expenses.csv
# - $P/bank-costs.csv
# - $I/kwh-usage.csv
# - data.ori
##there are more dependencies -- all the JS scripts used in data.ori -- but it doesn't make sense to include those all here.
# formula: >-
# ori 'Origami.csv data.ori/txns' > $P/indiv-val-txns.csv
# output:
# - $P/indiv-val-txns.csv
txns-with-other:
  help: manually add txns which were not in input bank transaction csv
  (ignore) doc: |
    we add this in the 'fromto' step.
  dependencies:
    - $P/txns-fromto.csv
    - $I/txns-other.csv
  # Concatenates the converted bank transactions with the hand-maintained
  # txns-other.csv. NOTE(review): `select * ... union all` requires both
  # csvs to have the same column layout -- presumably txns-other.csv is
  # already in the from/amount/to format; verify against the input file.
  formula: >-
    $C 'select * from "txns-fromto" union all
    select * from "txns-other"
    order by date, id;'
    $P/txns-fromto.csv
    $I/txns-other.csv
    > $P/txns-with-other.csv
  output:
    - $P/txns-with-other.csv
txns-with-generated:
  help: add the generated value transactions.
  # Appends the generated 'virtual' value transactions (collective totals
  # from total-val-txns and per-household allocations from indiv-val-txns)
  # to the real transactions from txns-with-other.
  dependencies:
    - $P/txns-with-other.csv
    - $P/total-val-txns.csv
    - $P/indiv-val-txns.csv
  formula: >-
    $C 'select * from "txns-with-other"
    union all select * from "total-val-txns"
    union all select * from "indiv-val-txns"
    order by date, id;'
    $P/txns-with-other.csv
    $P/total-val-txns.csv
    $P/indiv-val-txns.csv
    > $P/txns-with-generated.csv
  output:
    - $P/txns-with-generated.csv
txns:
  help: expand transactions to two rows per transaction.
  (ignore) doc: By expanding the transactions to two entries, one for the 'from' account and one for the 'to' account, we can do double-entry accounting (lite)
  dependencies:
    - $P/txns-with-generated.csv
  # Each transaction becomes a debit row (-amount, charged to the "from"
  # account) and a credit row (+amount, credited to the "to" account), so
  # summing per account gives each account's balance.
  formula: >-
    $C 'select id, date,
    "from" as account,
    -amount as amount,
    description
    from stdin union all
    select id, date,
    "to" as account,
    amount,
    description
    from stdin
    order by date, id;'
    < $P/txns-with-generated.csv
    > $P/txns.csv
  output:
    - $P/txns.csv
balance:
  help: get total balance
  dependencies:
    - $P/txns.csv
  # Per-account balances (rounded to cents) plus one grand-total row.
  # NOTE(review): the `total` CTE sums every row and is not rounded, while
  # `pre` skips rows with a null id and rounds to 2 decimals -- confirm both
  # asymmetries are intentional.
  # Fixed: added the space after the output redirect (`> $O/...`) for
  # consistency with every other task in this file.
  formula: >-
    $C 'with total as (
    select "total" as account, sum(amount) as amount
    from stdin
    ), pre as (
    select account,
    round(sum(amount),2) as amount
    from stdin
    where id is not null
    group by account
    )
    select * from pre union all select * from total;'
    < $P/txns.csv
    > $O/balance.csv
  output:
    - $O/balance.csv