From 81a1359f60d209f4a1df86b3f5ae51d3efbb4d82 Mon Sep 17 00:00:00 2001
From: Mitja Felicijan <mitja.felicijan@gmail.com>
Date: Sun, 11 Feb 2024 22:41:14 +0100
Subject: Added DNA k-mer post draft

---
 _layouts/base.html         |  14 ++++-
 _posts/2024-02-11-k-mer.md | 126 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 138 insertions(+), 2 deletions(-)
 create mode 100644 _posts/2024-02-11-k-mer.md
diff --git a/_layouts/base.html b/_layouts/base.html
index aa4be41..12202d9 100644
--- a/_layouts/base.html
+++ b/_layouts/base.html
@@ -4,7 +4,7 @@
     <meta charset="utf-8">
     <meta name="viewport" content="width=device-width,initial-scale=1">
 
-  <link href="data:image/x-icon;base64,AAABAAEAEBAAAAEAIABoBAAAFgAAACgAAAAQAAAAIAAAAAEAIAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAL69vf8AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAv76+/8LBwQkAAAAAAAAAAAAAAAC+vb3/AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAL+9vf/Bv78JAAAAAAAAAAAAAAAAu7q6/wAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAC7ubr/vr29CAAAAAAAAAAAy8nJAZ6foP8AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAnqGj/6GipAoAAAAAHLjU/xcXHf/BwsL/I8XY/yPK3v8XGiD/IbjL/yPF2f8XGiD/Fxkf/yLF2f8gnK3/Fxog/62ztv8fwNf/FRcd/x271v8mz93/GRsi/xkXHf8p097/GiIp/xobIv8p0t3/KdPe/xocIv8fYmr/KNPe/xoZH/8aHCL/J87c/xy81/8VFxz/IsPZ/8zS0/8XGiD/Ir/R/yPH2/8XGiD/Fxkf/yPH2/8dd4T/GBog/yPJ3f8jyNr/uru9/xcUGv8cudb/EhITDKi5vRKlvMP/RUpOERwcHRAdOj4QHTk8EBwdHRAdNTgQHTo/EBwcHRAcHB0QSGduEKW4vf+koqQfHzg+EBqz0ewSFRv7EyMr/xq51vsTERb7ExUb+xq41fsau9j7ExUb+xiPp/sZudb7ExUb+xMVG/sZuNX/GKvI/BIUGfMdvdn/IrfL/xcaIP8n1eb/J9Dh/xkcIf8ZGR7/J8/f/xxCSv8ZGyH/J9Dg/ybQ4P8ZHCL/FSQs/yPK3/8UExj/GE1b/ybS5P8ZGB7/Ghwj/ynW5P8p2Ob/Ghwi/yWrtv8p1eH/Ghwi/xocIv8p1uT/J8XT/xkcIv8m1un/Hb7d/xUYH/8hzOr/HtHu/xcaIf8XGB//I8vi/xgxOv8XGSD/I8rg/yPK4P8XGiD/GUFL/yPP6f8SERj/Fhkh/x3A4f8AAAAAJ2f9/ydr//8mZPH/AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAlYu38J2v//ydo/f8AAAAAAAAAAAd8/fkFqf//Iob8sAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMY39awWr//8FfP3/AAAAAAAAAAAFm/7/SfD//wR+/f8AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAOB/f9B7v//BaX+/wAAAAAAAAAAQ878SAyZ/v9n1v4KAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADu9v8DDJb+/z3N/XgAAAAA3/sAAN/7AADf+wAA3/sAAAAAAAAAAAAAAAAAAN/7AAAAAAAAAAAAAAAAAAAAAAAAj/EAAI/5AACP8QAA3/sAAA==" rel="icon" type="image/x-icon" />
+    <link href="data:image/x-icon;base64,AAABAAEAEBAAAAEAIABoBAAAFgAAACgAAAAQAAAAIAAAAAEAIAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAL69vf8AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAv76+/8LBwQkAAAAAAAAAAAAAAAC+vb3/AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAL+9vf/Bv78JAAAAAAAAAAAAAAAAu7q6/wAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAC7ubr/vr29CAAAAAAAAAAAy8nJAZ6foP8AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAnqGj/6GipAoAAAAAHLjU/xcXHf/BwsL/I8XY/yPK3v8XGiD/IbjL/yPF2f8XGiD/Fxkf/yLF2f8gnK3/Fxog/62ztv8fwNf/FRcd/x271v8mz93/GRsi/xkXHf8p097/GiIp/xobIv8p0t3/KdPe/xocIv8fYmr/KNPe/xoZH/8aHCL/J87c/xy81/8VFxz/IsPZ/8zS0/8XGiD/Ir/R/yPH2/8XGiD/Fxkf/yPH2/8dd4T/GBog/yPJ3f8jyNr/uru9/xcUGv8cudb/EhITDKi5vRKlvMP/RUpOERwcHRAdOj4QHTk8EBwdHRAdNTgQHTo/EBwcHRAcHB0QSGduEKW4vf+koqQfHzg+EBqz0ewSFRv7EyMr/xq51vsTERb7ExUb+xq41fsau9j7ExUb+xiPp/sZudb7ExUb+xMVG/sZuNX/GKvI/BIUGfMdvdn/IrfL/xcaIP8n1eb/J9Dh/xkcIf8ZGR7/J8/f/xxCSv8ZGyH/J9Dg/ybQ4P8ZHCL/FSQs/yPK3/8UExj/GE1b/ybS5P8ZGB7/Ghwj/ynW5P8p2Ob/Ghwi/yWrtv8p1eH/Ghwi/xocIv8p1uT/J8XT/xkcIv8m1un/Hb7d/xUYH/8hzOr/HtHu/xcaIf8XGB//I8vi/xgxOv8XGSD/I8rg/yPK4P8XGiD/GUFL/yPP6f8SERj/Fhkh/x3A4f8AAAAAJ2f9/ydr//8mZPH/AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAlYu38J2v//ydo/f8AAAAAAAAAAAd8/fkFqf//Iob8sAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMY39awWr//8FfP3/AAAAAAAAAAAFm/7/SfD//wR+/f8AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAOB/f9B7v//BaX+/wAAAAAAAAAAQ878SAyZ/v9n1v4KAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADu9v8DDJb+/z3N/XgAAAAA3/sAAN/7AADf+wAA3/sAAAAAAAAAAAAAAAAAAN/7AAAAAAAAAAAAAAAAAAAAAAAAj/EAAI/5AACP8QAA3/sAAA==" rel="icon" type="image/x-icon" />
 
     <title>
       {% if page.title %}{{ page.title | escape }} - {{ site.title | escape }}
@@ -17,7 +17,7 @@
 
     <style>
       :root {
-        --body-max-width: 1600px;
+        --body-max-width: 1200px;
         --border-color: gainsboro;
         --border-size: 1px;
         --border-style: solid;
@@ -61,6 +61,10 @@
         text-decoration: none;
       }
 
+      a:hover {
+        text-decoration: underline;
+      }
+
       h1, h2, h3 {
         line-height: initial;
       }
@@ -247,5 +251,11 @@
       <hr>
       <p><small>This page's most recent build occurred on  {{ site.time | date: "%A, %-d %B, %Y" }} and is also available as <a href="/feed.xml" target="_blank">RSS feed</a>.</small></p>
     </footer>
+
+    {% if page.mathjax %}
+      <script src="https://polyfill.io/v3/polyfill.min.js?features=es6"></script>
+      <script src="https://cdn.jsdelivr.net/npm/mathjax@3.0.1/es5/tex-mml-chtml.js" async></script>
+    {% endif %}
+
   </body>
 </html>
diff --git a/_posts/2024-02-11-k-mer.md b/_posts/2024-02-11-k-mer.md
new file mode 100644
index 0000000..f8f6e17
--- /dev/null
+++ b/_posts/2024-02-11-k-mer.md
@@ -0,0 +1,126 @@
+---
+title: "Navigating the genome using k-mers for DNA analysis and visualization"
+permalink: /navigating-the-genome-using-k-mers-for-dna-analysis-and-visualization.html
+date: 2024-02-11T01:04:28+02:00
+layout: post
+type: post
+mathjax: yes
+draft: true
+---
+
+## Brief introduction to K-mer
+
+A "k-mer" refers to all the possible substrings of length \\(k\\) contained in a
+string, which is commonly used in computational biology and bioinformatics. In
+the context of DNA, RNA, or protein sequences, a k-mer is a sequence of \\(k\\)
+nucleotides (for DNA and RNA) or amino acids (for proteins).
+
+The concept of k-mers is fundamental in various bioinformatics applications,
+including genome assembly, sequence alignment, and identification of repeat
+sequences. By analyzing the frequency and distribution of k-mers within a
+sequence or set of sequences, researchers can infer structural characteristics,
+identify genetic variants, and compare genomic or proteomic compositions between
+different organisms or conditions.
+
+For example, in genome assembly, k-mers are used to reconstruct the sequence of
+a genome from a collection of short sequencing reads. By finding overlaps
+between the k-mers derived from these reads, assembly algorithms can piece
+together contiguous sequences (contigs), which represent longer sections of the
+genome.
+
+The choice of \\(k\\) (the length of the k-mer) is crucial and depends on the
+specific application. A larger \\(k\\) provides more specificity (useful for
+distinguishing between closely related sequences), while a smaller \\(k\\)
+offers greater sensitivity (useful for detecting repeats or low-complexity
+regions). However, the computational resources required increase with \\(k\\),
+as there are \\(4^k\\) possible k-mers for nucleotide sequences (due to the four
+types of nucleotides: A, T, C, G) and \\(20^k\\) for amino acid sequences (due
+to the twenty standard amino acids).
+
+## K-mer counting
+
+K-mer counting is a fundamental process in bioinformatics used for analyzing the
+frequency of k-mers (subsequences of length \\(k\\)) in DNA, RNA, or protein
+sequences. Efficient k-mer counting is crucial for various applications such as
+genome assembly, metagenomics, and sequence comparison. The implementation
+typically involves parsing a sequence into all possible k-mers and then counting
+the occurrences of each unique k-mer. Here's a general approach to implementing
+k-mer counting:
+
+### Reading the Sequences
+
+The first step involves reading the genetic or protein sequences from files,
+which are often in formats like FASTA or FASTQ. These files contain one or
+multiple sequences that will be processed to extract k-mers.
+
+### Generating K-mers
+
+For each sequence, generate all possible subsequences of length \\(k\\). This is
+done by sliding a window of size \\(k\\) across the sequence, one nucleotide (or
+amino acid) at a time, and extracting the subsequence within this window.
+
+### Counting K-mers
+
+The extracted k-mers are then counted. This can be achieved using various data
+structures:
+
+- **Hash Tables (Dictionaries)**: They offer an efficient way to keep track of
+  k-mer counts, with k-mers as keys and their frequencies as values. This
+  approach is straightforward but can become memory-intensive with large
+  datasets or large values of \\(k\\).
+- **Suffix Trees or Arrays**: These data structures are more space-efficient for
+  k-mer counting, especially for large datasets. They allow for efficient
+  retrieval of k-mer occurrences but are more complex to implement.
+- **Bloom Filters and Count-Min Sketch**: For very large datasets, probabilistic
+  data structures like Bloom filters or Count-Min Sketch can estimate k-mer
+  counts using significantly less memory, at the cost of a controlled error
+  rate.
+
+### Handling Memory and Performance Issues
+
+K-mer counting can be memory-intensive, especially for large values of \\(k\\) or
+large datasets. Optimizations include:
+
+- **Compressing K-mers**: Representing k-mers using a binary format rather than
+  strings can save memory.
+- **Parallel Processing**: Distributing the k-mer counting task across multiple
+  processors or machines can significantly speed up the process.
+- **Minimizing I/O Operations**: Efficiently reading and processing sequences
+  from files in chunks reduces I/O overhead.
+
+### Post-processing
+
+After counting, the k-mer frequencies can be used directly for analyses or can
+undergo further processing, such as filtering rare k-mers, which are often
+errors, or normalizing counts for comparative analysis.
+
+### Implementation Example
+
+Here's a simple Python example using a dictionary for k-mer counting:
+
+```python
+def count_kmers(sequence, k):
+    kmer_counts = {}
+    for i in range(len(sequence) - k + 1):
+        kmer = sequence[i:i+k]
+        if kmer in kmer_counts:
+            kmer_counts[kmer] += 1
+        else:
+            kmer_counts[kmer] = 1
+    return kmer_counts
+
+# Example usage
+sequence = "ATGCGATGATCTGATG"
+k = 3
+kmer_counts = count_kmers(sequence, k)
+print(kmer_counts)
+```
+
+This code snippet counts the occurrences of each 3-mer in a given sequence.
+
+For real-world applications, especially those involving large datasets, consider
+using specialized bioinformatics tools like Jellyfish, KMC, or khmer, which are
+optimized for efficiency and scalability.
+
+Now that we have the basics out of the way we can start implementing basic k-mer
+counter in C.
-- 
cgit v1.2.3